// Extracted from the LLVM 20.0.0git documentation page for SLPVectorizer.cpp.
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
126 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
135 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139static cl::opt<int>
141 cl::desc("Attempt to vectorize for this register size in bits"));
142
145 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
146
147/// Limits the size of scheduling regions in a block.
148/// It avoid long compile times for _very_ large blocks where vector
149/// instructions are spread over a wide range.
150/// This limit is way higher than needed by real-world functions.
151static cl::opt<int>
152ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
153 cl::desc("Limit the size of the SLP scheduling region per block"));
154
156 "slp-min-reg-size", cl::init(128), cl::Hidden,
157 cl::desc("Attempt to vectorize for this register size in bits"));
158
160 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
161 cl::desc("Limit the recursion depth when building a vectorizable tree"));
162
164 "slp-min-tree-size", cl::init(3), cl::Hidden,
165 cl::desc("Only vectorize small trees if they are fully vectorizable"));
166
167// The maximum depth that the look-ahead score heuristic will explore.
168// The higher this value, the higher the compilation time overhead.
170 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
171 cl::desc("The maximum look-ahead depth for operand reordering scores"));
172
173// The maximum depth that the look-ahead score heuristic will explore
174// when it probing among candidates for vectorization tree roots.
175// The higher this value, the higher the compilation time overhead but unlike
176// similar limit for operands ordering this is less frequently used, hence
177// impact of higher value is less noticeable.
179 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
180 cl::desc("The maximum look-ahead depth for searching best rooting option"));
181
183 "slp-min-strided-loads", cl::init(2), cl::Hidden,
184 cl::desc("The minimum number of loads, which should be considered strided, "
185 "if the stride is > 1 or is runtime value"));
186
188 "slp-max-stride", cl::init(8), cl::Hidden,
189 cl::desc("The maximum stride, considered to be profitable."));
190
191static cl::opt<bool>
192 ViewSLPTree("view-slp-tree", cl::Hidden,
193 cl::desc("Display the SLP trees with Graphviz"));
194
196 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
197 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
198
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
/// PHIs with more operands are not considered for vectorization.
static const unsigned MaxPHINumOperands = 128;
219/// Predicate for the element types that the SLP vectorizer supports.
220///
221/// The most important thing to filter here are types which are invalid in LLVM
222/// vectors. We also filter target specific types which have absolutely no
223/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
224/// avoids spending time checking the cost model and realizing that they will
225/// be inevitably scalarized.
226static bool isValidElementType(Type *Ty) {
227 // TODO: Support ScalableVectorType.
229 Ty = Ty->getScalarType();
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
232}
233
234/// Returns the type of the given value/instruction \p V. If it is store,
235/// returns the type of its value operand, for Cmp - the types of the compare
236/// operands and for insertelement - the type os the inserted operand.
237/// Otherwise, just the type of the value is returned.
238template <typename T> static Type *getValueType(T *V) {
239 if (auto *SI = dyn_cast<StoreInst>(V))
240 return SI->getValueOperand()->getType();
241 if (auto *CI = dyn_cast<CmpInst>(V))
242 return CI->getOperand(0)->getType();
243 if (auto *IE = dyn_cast<InsertElementInst>(V))
244 return IE->getOperand(1)->getType();
245 return V->getType();
246}
247
248/// \returns the number of elements for Ty.
249static unsigned getNumElements(Type *Ty) {
251 "ScalableVectorType is not supported.");
252 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
253 return VecTy->getNumElements();
254 return 1;
255}
256
257/// \returns the vector type of ScalarTy based on vectorization factor.
258static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
259 return FixedVectorType::get(ScalarTy->getScalarType(),
260 VF * getNumElements(ScalarTy));
261}
262
263/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
264/// which forms type, which splits by \p TTI into whole vector types during
265/// legalization.
267 Type *Ty, unsigned Sz) {
268 if (!isValidElementType(Ty))
269 return bit_ceil(Sz);
270 // Find the number of elements, which forms full vectors.
271 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
272 if (NumParts == 0 || NumParts >= Sz)
273 return bit_ceil(Sz);
274 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
275}
276
277static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
278 SmallVectorImpl<int> &Mask) {
279 // The ShuffleBuilder implementation use shufflevector to splat an "element".
280 // But the element have different meaning for SLP (scalar) and REVEC
281 // (vector). We need to expand Mask into masks which shufflevector can use
282 // directly.
283 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
284 for (unsigned I : seq<unsigned>(Mask.size()))
285 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
286 I * VecTyNumElements, VecTyNumElements)))
287 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
288 : Mask[I] * VecTyNumElements + J;
289 Mask.swap(NewMask);
290}
291
292/// \returns the number of groups of shufflevector
293/// A group has the following features
294/// 1. All of value in a group are shufflevector.
295/// 2. The mask of all shufflevector is isExtractSubvectorMask.
296/// 3. The mask of all shufflevector uses all of the elements of the source.
297/// e.g., it is 1 group (%0)
298/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
299/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
300/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
301/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
302/// it is 2 groups (%3 and %4)
303/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
304/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
305/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
306/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
307/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
308/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
309/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
310/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
311/// it is 0 group
312/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
313/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
314/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
315/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
317 if (VL.empty())
318 return 0;
320 return 0;
321 auto *SV = cast<ShuffleVectorInst>(VL.front());
322 unsigned SVNumElements =
323 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
324 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
325 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
326 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
327 return 0;
328 unsigned NumGroup = 0;
329 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
330 auto *SV = cast<ShuffleVectorInst>(VL[I]);
331 Value *Src = SV->getOperand(0);
332 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
333 SmallBitVector ExpectedIndex(GroupSize);
334 if (!all_of(Group, [&](Value *V) {
335 auto *SV = cast<ShuffleVectorInst>(V);
336 // From the same source.
337 if (SV->getOperand(0) != Src)
338 return false;
339 int Index;
340 if (!SV->isExtractSubvectorMask(Index))
341 return false;
342 ExpectedIndex.set(Index / ShuffleMaskSize);
343 return true;
344 }))
345 return 0;
346 if (!ExpectedIndex.all())
347 return 0;
348 ++NumGroup;
349 }
350 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
351 return NumGroup;
352}
353
354/// \returns a shufflevector mask which is used to vectorize shufflevectors
355/// e.g.,
356/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
357/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
358/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
359/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
360/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
361/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
362/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
363/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
364/// the result is
365/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
367 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
368 auto *SV = cast<ShuffleVectorInst>(VL.front());
369 unsigned SVNumElements =
370 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
371 SmallVector<int> Mask;
372 unsigned AccumulateLength = 0;
373 for (Value *V : VL) {
374 auto *SV = cast<ShuffleVectorInst>(V);
375 for (int M : SV->getShuffleMask())
376 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
377 : AccumulateLength + M);
378 AccumulateLength += SVNumElements;
379 }
380 return Mask;
381}
382
383/// \returns True if the value is a constant (but not globals/constant
384/// expressions).
385static bool isConstant(Value *V) {
387}
388
389/// Checks if \p V is one of vector-like instructions, i.e. undef,
390/// insertelement/extractelement with constant indices for fixed vector type or
391/// extractvalue instruction.
395 return false;
396 auto *I = dyn_cast<Instruction>(V);
397 if (!I || isa<ExtractValueInst>(I))
398 return true;
399 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
400 return false;
402 return isConstant(I->getOperand(1));
403 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
404 return isConstant(I->getOperand(2));
405}
406
407/// Returns power-of-2 number of elements in a single register (part), given the
408/// total number of elements \p Size and number of registers (parts) \p
409/// NumParts.
410static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
411 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
412}
413
/// Returns correct remaining number of elements, considering total amount
/// \p Size, (power-of-2 number) of elements in a single register
/// \p PartNumElems and current register (part) \p Part. The last part may be
/// smaller than a full register.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  unsigned Remaining = Size - Part * PartNumElems;
  return std::min<unsigned>(PartNumElems, Remaining);
}
421
422#if !defined(NDEBUG)
423/// Print a short descriptor of the instruction bundle suitable for debug output.
424static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
425 std::string Result;
426 raw_string_ostream OS(Result);
427 if (Idx >= 0)
428 OS << "Idx: " << Idx << ", ";
429 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
430 return Result;
431}
432#endif
433
434/// \returns true if all of the instructions in \p VL are in the same block or
435/// false otherwise.
438 if (!I0)
439 return false;
441 return true;
442
443 BasicBlock *BB = I0->getParent();
444 for (int I = 1, E = VL.size(); I < E; I++) {
445 auto *II = dyn_cast<Instruction>(VL[I]);
446 if (!II)
447 return false;
448
449 if (BB != II->getParent())
450 return false;
451 }
452 return true;
453}
454
455/// \returns True if all of the values in \p VL are constants (but not
456/// globals/constant expressions).
458 // Constant expressions and globals can't be vectorized like normal integer/FP
459 // constants.
460 return all_of(VL, isConstant);
461}
462
463/// \returns True if all of the values in \p VL are identical or some of them
464/// are UndefValue.
465static bool isSplat(ArrayRef<Value *> VL) {
466 Value *FirstNonUndef = nullptr;
467 for (Value *V : VL) {
468 if (isa<UndefValue>(V))
469 continue;
470 if (!FirstNonUndef) {
471 FirstNonUndef = V;
472 continue;
473 }
474 if (V != FirstNonUndef)
475 return false;
476 }
477 return FirstNonUndef != nullptr;
478}
479
480/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
482 if (auto *Cmp = dyn_cast<CmpInst>(I))
483 return Cmp->isCommutative();
484 if (auto *BO = dyn_cast<BinaryOperator>(I))
485 return BO->isCommutative() ||
486 (BO->getOpcode() == Instruction::Sub &&
487 !BO->hasNUsesOrMore(UsesLimit) &&
488 all_of(
489 BO->uses(),
490 [](const Use &U) {
491 // Commutative, if icmp eq/ne sub, 0
492 ICmpInst::Predicate Pred;
493 if (match(U.getUser(),
494 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
495 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
496 return true;
497 // Commutative, if abs(sub nsw, true) or abs(sub, false).
498 ConstantInt *Flag;
499 return match(U.getUser(),
500 m_Intrinsic<Intrinsic::abs>(
501 m_Specific(U.get()), m_ConstantInt(Flag))) &&
502 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
503 Flag->isOne());
504 })) ||
505 (BO->getOpcode() == Instruction::FSub &&
506 !BO->hasNUsesOrMore(UsesLimit) &&
507 all_of(BO->uses(), [](const Use &U) {
508 return match(U.getUser(),
509 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
510 }));
511 return I->isCommutative();
512}
513
514template <typename T>
515static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
516 unsigned Offset) {
517 static_assert(std::is_same_v<T, InsertElementInst> ||
518 std::is_same_v<T, ExtractElementInst>,
519 "unsupported T");
520 int Index = Offset;
521 if (const auto *IE = dyn_cast<T>(Inst)) {
522 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
523 if (!VT)
524 return std::nullopt;
525 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
526 if (!CI)
527 return std::nullopt;
528 if (CI->getValue().uge(VT->getNumElements()))
529 return std::nullopt;
530 Index *= VT->getNumElements();
531 Index += CI->getZExtValue();
532 return Index;
533 }
534 return std::nullopt;
535}
536
537/// \returns inserting or extracting index of InsertElement, ExtractElement or
538/// InsertValue instruction, using Offset as base offset for index.
539/// \returns std::nullopt if the index is not an immediate.
540static std::optional<unsigned> getElementIndex(const Value *Inst,
541 unsigned Offset = 0) {
543 return Index;
545 return Index;
546
547 int Index = Offset;
548
549 const auto *IV = dyn_cast<InsertValueInst>(Inst);
550 if (!IV)
551 return std::nullopt;
552
553 Type *CurrentType = IV->getType();
554 for (unsigned I : IV->indices()) {
555 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
556 Index *= ST->getNumElements();
557 CurrentType = ST->getElementType(I);
558 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
559 Index *= AT->getNumElements();
560 CurrentType = AT->getElementType();
561 } else {
562 return std::nullopt;
563 }
564 Index += I;
565 }
566 return Index;
567}
568
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask. Consumed by buildUseMask below.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
585
586/// Prepares a use bitset for the given mask either for the first argument or
587/// for the second.
589 UseMask MaskArg) {
590 SmallBitVector UseMask(VF, true);
591 for (auto [Idx, Value] : enumerate(Mask)) {
592 if (Value == PoisonMaskElem) {
593 if (MaskArg == UseMask::UndefsAsMask)
594 UseMask.reset(Idx);
595 continue;
596 }
597 if (MaskArg == UseMask::FirstArg && Value < VF)
598 UseMask.reset(Value);
599 else if (MaskArg == UseMask::SecondArg && Value >= VF)
600 UseMask.reset(Value - VF);
601 }
602 return UseMask;
603}
604
605/// Checks if the given value is actually an undefined constant vector.
606/// Also, if the \p UseMask is not empty, tries to check if the non-masked
607/// elements actually mask the insertelement buildvector, if any.
608template <bool IsPoisonOnly = false>
610 const SmallBitVector &UseMask = {}) {
611 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
612 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
613 if (isa<T>(V))
614 return Res;
615 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
616 if (!VecTy)
617 return Res.reset();
618 auto *C = dyn_cast<Constant>(V);
619 if (!C) {
620 if (!UseMask.empty()) {
621 const Value *Base = V;
622 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
623 Base = II->getOperand(0);
624 if (isa<T>(II->getOperand(1)))
625 continue;
626 std::optional<unsigned> Idx = getElementIndex(II);
627 if (!Idx) {
628 Res.reset();
629 return Res;
630 }
631 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
632 Res.reset(*Idx);
633 }
634 // TODO: Add analysis for shuffles here too.
635 if (V == Base) {
636 Res.reset();
637 } else {
638 SmallBitVector SubMask(UseMask.size(), false);
639 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
640 }
641 } else {
642 Res.reset();
643 }
644 return Res;
645 }
646 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
647 if (Constant *Elem = C->getAggregateElement(I))
648 if (!isa<T>(Elem) &&
649 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
650 Res.reset(I);
651 }
652 return Res;
653}
654
655/// Checks if the vector of instructions can be represented as a shuffle, like:
656/// %x0 = extractelement <4 x i8> %x, i32 0
657/// %x3 = extractelement <4 x i8> %x, i32 3
658/// %y1 = extractelement <4 x i8> %y, i32 1
659/// %y2 = extractelement <4 x i8> %y, i32 2
660/// %x0x0 = mul i8 %x0, %x0
661/// %x3x3 = mul i8 %x3, %x3
662/// %y1y1 = mul i8 %y1, %y1
663/// %y2y2 = mul i8 %y2, %y2
664/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
665/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
666/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
667/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
668/// ret <4 x i8> %ins4
669/// can be transformed into:
670/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
671/// i32 6>
672/// %2 = mul <4 x i8> %1, %1
673/// ret <4 x i8> %2
674/// Mask will return the Shuffle Mask equivalent to the extracted elements.
675/// TODO: Can we split off and reuse the shuffle mask detection from
676/// ShuffleVectorInst/getShuffleCost?
677static std::optional<TargetTransformInfo::ShuffleKind>
679 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
680 if (It == VL.end())
681 return std::nullopt;
682 unsigned Size =
683 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
684 auto *EI = dyn_cast<ExtractElementInst>(V);
685 if (!EI)
686 return S;
687 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
688 if (!VTy)
689 return S;
690 return std::max(S, VTy->getNumElements());
691 });
692
693 Value *Vec1 = nullptr;
694 Value *Vec2 = nullptr;
695 bool HasNonUndefVec = any_of(VL, [](Value *V) {
696 auto *EE = dyn_cast<ExtractElementInst>(V);
697 if (!EE)
698 return false;
699 Value *Vec = EE->getVectorOperand();
700 if (isa<UndefValue>(Vec))
701 return false;
702 return isGuaranteedNotToBePoison(Vec);
703 });
704 enum ShuffleMode { Unknown, Select, Permute };
705 ShuffleMode CommonShuffleMode = Unknown;
706 Mask.assign(VL.size(), PoisonMaskElem);
707 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
708 // Undef can be represented as an undef element in a vector.
709 if (isa<UndefValue>(VL[I]))
710 continue;
711 auto *EI = cast<ExtractElementInst>(VL[I]);
712 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
713 return std::nullopt;
714 auto *Vec = EI->getVectorOperand();
715 // We can extractelement from undef or poison vector.
717 continue;
718 // All vector operands must have the same number of vector elements.
719 if (isa<UndefValue>(Vec)) {
720 Mask[I] = I;
721 } else {
722 if (isa<UndefValue>(EI->getIndexOperand()))
723 continue;
724 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
725 if (!Idx)
726 return std::nullopt;
727 // Undefined behavior if Idx is negative or >= Size.
728 if (Idx->getValue().uge(Size))
729 continue;
730 unsigned IntIdx = Idx->getValue().getZExtValue();
731 Mask[I] = IntIdx;
732 }
733 if (isUndefVector(Vec).all() && HasNonUndefVec)
734 continue;
735 // For correct shuffling we have to have at most 2 different vector operands
736 // in all extractelement instructions.
737 if (!Vec1 || Vec1 == Vec) {
738 Vec1 = Vec;
739 } else if (!Vec2 || Vec2 == Vec) {
740 Vec2 = Vec;
741 Mask[I] += Size;
742 } else {
743 return std::nullopt;
744 }
745 if (CommonShuffleMode == Permute)
746 continue;
747 // If the extract index is not the same as the operation number, it is a
748 // permutation.
749 if (Mask[I] % Size != I) {
750 CommonShuffleMode = Permute;
751 continue;
752 }
753 CommonShuffleMode = Select;
754 }
755 // If we're not crossing lanes in different vectors, consider it as blending.
756 if (CommonShuffleMode == Select && Vec2)
758 // If Vec2 was never used, we have a permutation of a single vector, otherwise
759 // we have permutation of 2 vectors.
762}
763
764/// \returns True if Extract{Value,Element} instruction extracts element Idx.
765static std::optional<unsigned> getExtractIndex(Instruction *E) {
766 unsigned Opcode = E->getOpcode();
767 assert((Opcode == Instruction::ExtractElement ||
768 Opcode == Instruction::ExtractValue) &&
769 "Expected extractelement or extractvalue instruction.");
770 if (Opcode == Instruction::ExtractElement) {
771 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
772 if (!CI)
773 return std::nullopt;
774 return CI->getZExtValue();
775 }
776 auto *EI = cast<ExtractValueInst>(E);
777 if (EI->getNumIndices() != 1)
778 return std::nullopt;
779 return *EI->idx_begin();
780}
781
namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  /// MainOp and AltOp are equal when there is no alternate opcode.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  /// \returns 0 when no main instruction was identified.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  /// \returns 0 when no alternate instruction was identified.
  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  /// \returns true if \p I's opcode matches either the main or the alternate
  /// opcode of this state.
  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace
816
817/// \returns true if \p Opcode is allowed as part of the main/alternate
818/// instruction for SLP vectorization.
819///
820/// Example of unsupported opcode is SDIV that can potentially cause UB if the
821/// "shuffled out" lane would result in division by zero.
822static bool isValidForAlternation(unsigned Opcode) {
823 if (Instruction::isIntDivRem(Opcode))
824 return false;
825
826 return true;
827}
828
829static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
830 const TargetLibraryInfo &TLI,
831 unsigned BaseIndex = 0);
832
833/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
834/// compatible instructions or constants, or just some other regular values.
835static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
836 Value *Op1, const TargetLibraryInfo &TLI) {
837 return (isConstant(BaseOp0) && isConstant(Op0)) ||
838 (isConstant(BaseOp1) && isConstant(Op1)) ||
839 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
840 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
841 BaseOp0 == Op0 || BaseOp1 == Op1 ||
842 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
843 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
844}
845
846/// \returns true if a compare instruction \p CI has similar "look" and
847/// same predicate as \p BaseCI, "as is" or with its operands and predicate
848/// swapped, false otherwise.
849static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
850 const TargetLibraryInfo &TLI) {
851 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
852 "Assessing comparisons of different types?");
853 CmpInst::Predicate BasePred = BaseCI->getPredicate();
854 CmpInst::Predicate Pred = CI->getPredicate();
856
857 Value *BaseOp0 = BaseCI->getOperand(0);
858 Value *BaseOp1 = BaseCI->getOperand(1);
859 Value *Op0 = CI->getOperand(0);
860 Value *Op1 = CI->getOperand(1);
861
862 return (BasePred == Pred &&
863 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
864 (BasePred == SwappedPred &&
865 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
866}
867
868/// \returns analysis of the Instructions in \p VL described in
869/// InstructionsState, the Opcode that we suppose the whole list
870/// could be vectorized even if its structure is diverse.
871static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
872 const TargetLibraryInfo &TLI,
873 unsigned BaseIndex) {
874 // Make sure these are all Instructions.
875 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
876 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
877
878 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
879 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
880 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
881 CmpInst::Predicate BasePred =
882 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
884 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
885 unsigned AltOpcode = Opcode;
886 unsigned AltIndex = BaseIndex;
887
888 bool SwappedPredsCompatible = [&]() {
889 if (!IsCmpOp)
890 return false;
891 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
892 UniquePreds.insert(BasePred);
893 UniqueNonSwappedPreds.insert(BasePred);
894 for (Value *V : VL) {
895 auto *I = dyn_cast<CmpInst>(V);
896 if (!I)
897 return false;
898 CmpInst::Predicate CurrentPred = I->getPredicate();
899 CmpInst::Predicate SwappedCurrentPred =
900 CmpInst::getSwappedPredicate(CurrentPred);
901 UniqueNonSwappedPreds.insert(CurrentPred);
902 if (!UniquePreds.contains(CurrentPred) &&
903 !UniquePreds.contains(SwappedCurrentPred))
904 UniquePreds.insert(CurrentPred);
905 }
906 // Total number of predicates > 2, but if consider swapped predicates
907 // compatible only 2, consider swappable predicates as compatible opcodes,
908 // not alternate.
909 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
910 }();
911 // Check for one alternate opcode from another BinaryOperator.
912 // TODO - generalize to support all operators (types, calls etc.).
913 auto *IBase = cast<Instruction>(VL[BaseIndex]);
914 Intrinsic::ID BaseID = 0;
915 SmallVector<VFInfo> BaseMappings;
916 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
918 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
919 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
920 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
921 }
922 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
923 auto *I = cast<Instruction>(VL[Cnt]);
924 unsigned InstOpcode = I->getOpcode();
925 if (IsBinOp && isa<BinaryOperator>(I)) {
926 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
927 continue;
928 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
929 isValidForAlternation(Opcode)) {
930 AltOpcode = InstOpcode;
931 AltIndex = Cnt;
932 continue;
933 }
934 } else if (IsCastOp && isa<CastInst>(I)) {
935 Value *Op0 = IBase->getOperand(0);
936 Type *Ty0 = Op0->getType();
937 Value *Op1 = I->getOperand(0);
938 Type *Ty1 = Op1->getType();
939 if (Ty0 == Ty1) {
940 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
941 continue;
942 if (Opcode == AltOpcode) {
944 isValidForAlternation(InstOpcode) &&
945 "Cast isn't safe for alternation, logic needs to be updated!");
946 AltOpcode = InstOpcode;
947 AltIndex = Cnt;
948 continue;
949 }
950 }
951 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
952 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
953 Type *Ty0 = BaseInst->getOperand(0)->getType();
954 Type *Ty1 = Inst->getOperand(0)->getType();
955 if (Ty0 == Ty1) {
956 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
957 // Check for compatible operands. If the corresponding operands are not
958 // compatible - need to perform alternate vectorization.
959 CmpInst::Predicate CurrentPred = Inst->getPredicate();
960 CmpInst::Predicate SwappedCurrentPred =
961 CmpInst::getSwappedPredicate(CurrentPred);
962
963 if ((E == 2 || SwappedPredsCompatible) &&
964 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
965 continue;
966
967 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
968 continue;
969 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
970 if (AltIndex != BaseIndex) {
971 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
972 continue;
973 } else if (BasePred != CurrentPred) {
974 assert(
975 isValidForAlternation(InstOpcode) &&
976 "CmpInst isn't safe for alternation, logic needs to be updated!");
977 AltIndex = Cnt;
978 continue;
979 }
980 CmpInst::Predicate AltPred = AltInst->getPredicate();
981 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
982 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
983 continue;
984 }
985 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
986 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
987 if (Gep->getNumOperands() != 2 ||
988 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
989 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
990 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
992 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
993 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
994 auto *BaseLI = cast<LoadInst>(IBase);
995 if (!LI->isSimple() || !BaseLI->isSimple())
996 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
997 } else if (auto *Call = dyn_cast<CallInst>(I)) {
998 auto *CallBase = cast<CallInst>(IBase);
1000 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1002 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1004 CallBase->op_begin() +
1006 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1008 if (ID != BaseID)
1009 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1010 if (!ID) {
1011 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1012 if (Mappings.size() != BaseMappings.size() ||
1013 Mappings.front().ISA != BaseMappings.front().ISA ||
1014 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1015 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1016 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1017 Mappings.front().Shape.Parameters !=
1018 BaseMappings.front().Shape.Parameters)
1019 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1020 }
1021 }
1022 continue;
1023 }
1024 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
1025 }
1026
1027 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
1028 cast<Instruction>(VL[AltIndex]));
1029}
1030
1031/// \returns true if all of the values in \p VL have the same type or false
1032/// otherwise.
// Compares every element's type against the first element's type.
// Assumes \p VL is non-empty (front()/drop_front() on an empty ArrayRef
// would be invalid) — TODO confirm callers guarantee this.
// NOTE(review): the function signature line (1033) is elided in this
// rendered listing.
1034 Type *Ty = VL.front()->getType();
1035 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1036}
1037
1038/// \returns True if in-tree use also needs extract. This refers to
1039/// possible scalar operand in vectorized instruction.
// Returns true when \p UserInst consumes \p Scalar in a position that must
// remain scalar even after vectorization, so an extractelement from the
// vectorized value is required: the pointer operand of a load/store, or a
// call argument that the vector form of the intrinsic requires to be scalar.
1040static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1041 TargetLibraryInfo *TLI) {
1042 if (!UserInst)
1043 return false;
1044 unsigned Opcode = UserInst->getOpcode();
1045 switch (Opcode) {
1046 case Instruction::Load: {
1047 LoadInst *LI = cast<LoadInst>(UserInst);
1048 return (LI->getPointerOperand() == Scalar);
1049 }
1050 case Instruction::Store: {
1051 StoreInst *SI = cast<StoreInst>(UserInst);
1052 return (SI->getPointerOperand() == Scalar);
1053 }
1054 case Instruction::Call: {
1055 CallInst *CI = cast<CallInst>(UserInst);
// NOTE(review): line 1056 (which defines the intrinsic ID used below,
// presumably via getVectorIntrinsicIDForCall) is elided in this rendered
// listing.
1057 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1058 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
1059 Arg.value().get() == Scalar;
1060 });
1061 }
1062 default:
1063 return false;
1064 }
1065}
1066
1067/// \returns the AA location that is being accessed by the instruction.
// Only loads and stores yield a concrete MemoryLocation; every other
// instruction yields a default-constructed (empty) location.
// NOTE(review): the function signature line (1068) is elided in this
// rendered listing.
1069 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1070 return MemoryLocation::get(SI);
1071 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1072 return MemoryLocation::get(LI);
1073 return MemoryLocation();
1074}
1075
1076/// \returns True if the instruction is not a volatile or atomic load/store.
// "Simple" here is LoadInst/StoreInst::isSimple(): not volatile, not atomic.
// Non-memory instructions are trivially simple (final "return true").
1077static bool isSimple(Instruction *I) {
1078 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1079 return LI->isSimple();
1080 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1081 return SI->isSimple();
// NOTE(review): line 1082 (the dyn_cast defining MI, presumably a
// MemIntrinsic guard) is elided in this rendered listing.
1083 return !MI->isVolatile();
1084 return true;
1085}
1086
1087/// Shuffles \p Mask in accordance with the given \p SubMask.
1088/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1089/// one but two input vectors.
1090static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1091 bool ExtendingManyInputs = false) {
1092 if (SubMask.empty())
1093 return;
1094 assert(
1095 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1096 // Check if input scalars were extended to match the size of other node.
1097 (SubMask.size() == Mask.size() &&
1098 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
1099 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
1100 "SubMask with many inputs support must be larger than the mask.");
1101 if (Mask.empty()) {
1102 Mask.append(SubMask.begin(), SubMask.end());
1103 return;
1104 }
1105 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1106 int TermValue = std::min(Mask.size(), SubMask.size());
1107 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1108 if (SubMask[I] == PoisonMaskElem ||
1109 (!ExtendingManyInputs &&
1110 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1111 continue;
1112 NewMask[I] = Mask[SubMask[I]];
1113 }
1114 Mask.swap(NewMask);
1115}
1116
1117/// Order may have elements assigned special value (size) which is out of
1118/// bounds. Such indices only appear on places which correspond to undef values
1119/// (see canReuseExtract for details) and used in order to avoid undef values
1120/// have effect on operands ordering.
1121/// The first loop below simply finds all unused indices and then the next loop
1122/// nest assigns these indices for undef values positions.
1123/// As an example below Order has two undef positions and they have assigned
1124/// values 3 and 7 respectively:
1125/// before: 6 9 5 4 9 2 1 0
1126/// after: 6 3 5 4 7 2 1 0
// NOTE(review): the function signature line (1127) is elided in this
// rendered listing.
1128 const unsigned Sz = Order.size();
// UnusedIndices starts all-true; each in-bounds index appearing in Order
// is cleared, leaving exactly the never-used values. MaskedIndices records
// the positions that hold the out-of-bounds sentinel.
1129 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1130 SmallBitVector MaskedIndices(Sz);
1131 for (unsigned I = 0; I < Sz; ++I) {
1132 if (Order[I] < Sz)
1133 UnusedIndices.reset(Order[I]);
1134 else
1135 MaskedIndices.set(I);
1136 }
1137 if (MaskedIndices.none())
1138 return;
1139 assert(UnusedIndices.count() == MaskedIndices.count() &&
1140 "Non-synced masked/available indices.");
// Walk both bit vectors in lockstep, assigning each unused value to the
// next masked position so Order becomes a proper permutation.
1141 int Idx = UnusedIndices.find_first();
1142 int MIdx = MaskedIndices.find_first();
1143 while (MIdx >= 0) {
1144 assert(Idx >= 0 && "Indices must be synced.");
1145 Order[MIdx] = Idx;
1146 Idx = UnusedIndices.find_next(Idx);
1147 MIdx = MaskedIndices.find_next(MIdx);
1148 }
1149}
1150
1151/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1152/// Opcode1.
// NOTE(review): the first line of the signature (1153) is elided in this
// rendered listing; Opcode0 is implicit as "any opcode that is not Opcode1".
1154 unsigned Opcode1) {
// Each scalar contributes getNumElements(ScalarTy) mask bits, so the mask
// also covers scalars that are themselves vector-typed (revectorization).
1155 Type *ScalarTy = VL[0]->getType();
1156 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1157 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1158 for (unsigned Lane : seq<unsigned>(VL.size()))
1159 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1160 OpcodeMask.set(Lane * ScalarTyNumElements,
1161 Lane * ScalarTyNumElements + ScalarTyNumElements);
1162 return OpcodeMask;
1163}
1164
1165namespace llvm {
1166
// Builds in \p Mask the inverse of the permutation given by \p Indices:
// Mask[Indices[I]] = I. Positions not covered by Indices remain
// PoisonMaskElem.
// NOTE(review): the first line of the signature (1167) is elided in this
// rendered listing.
1168 SmallVectorImpl<int> &Mask) {
1169 Mask.clear();
1170 const unsigned E = Indices.size();
1171 Mask.resize(E, PoisonMaskElem);
1172 for (unsigned I = 0; I < E; ++I)
1173 Mask[Indices[I]] = I;
1174}
1175
1176/// Reorders the list of scalars in accordance with the given \p Mask.
// NOTE(review): the first line of the signature (1177) is elided in this
// rendered listing.
1178 ArrayRef<int> Mask) {
// Move the current scalars aside, then scatter them back through the mask;
// lanes with no mapping keep the poison placeholder.
1179 assert(!Mask.empty() && "Expected non-empty mask.");
1180 SmallVector<Value *> Prev(Scalars.size(),
1181 PoisonValue::get(Scalars.front()->getType()));
1182 Prev.swap(Scalars);
1183 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1184 if (Mask[I] != PoisonMaskElem)
1185 Scalars[Mask[I]] = Prev[I];
1186}
1187
1188/// Checks if the provided value does not require scheduling. It does not
1189/// require scheduling if this is not an instruction or it is an instruction
1190/// that does not read/write memory and all operands are either not instructions
1191/// or phi nodes or instructions from different blocks.
// NOTE(review): the function signature line (1192) is elided in this
// rendered listing.
1193 auto *I = dyn_cast<Instruction>(V);
1194 if (!I)
1195 return true;
// Scheduling is needed only if the instruction may interact with memory or
// depends on a non-phi instruction defined in its own basic block.
1196 return !mayHaveNonDefUseDependency(*I) &&
1197 all_of(I->operands(), [I](Value *V) {
1198 auto *IO = dyn_cast<Instruction>(V);
1199 if (!IO)
1200 return true;
1201 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1202 });
1203}
1204
1205/// Checks if the provided value does not require scheduling. It does not
1206/// require scheduling if this is not an instruction or it is an instruction
1207/// that does not read/write memory and all users are phi nodes or instructions
1208/// from the different blocks.
1209static bool isUsedOutsideBlock(Value *V) {
1210 auto *I = dyn_cast<Instruction>(V);
1211 if (!I)
1212 return true;
1213 // Limits the number of uses to save compile time.
1214 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1215 all_of(I->users(), [I](User *U) {
1216 auto *IU = dyn_cast<Instruction>(U);
1217 if (!IU)
1218 return true;
1219 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1220 });
1221}
1222
1223/// Checks if the specified value does not require scheduling. It does not
1224/// require scheduling if all operands and all users do not need to be scheduled
1225/// in the current basic block.
// NOTE(review): the signature and body lines (1226-1227) are elided in
// this rendered listing; only the closing brace is visible below.
1228}
1229
1230/// Checks if the specified array of instructions does not require scheduling.
1231/// It is so if all either instructions have operands that do not require
1232/// scheduling or their users do not require scheduling since they are phis or
1233/// in other basic blocks.
// NOTE(review): the signature line (1234) and the predicate line (1236)
// are elided in this rendered listing.
1235 return !VL.empty() &&
1237}
1238
1239/// Returns true if widened type of \p Ty elements with size \p Sz represents
1240/// full vector type, i.e. adding extra element results in extra parts upon type
1241/// legalization.
// NOTE(review): the first signature line (1242) and the condition guarding
// the early "return false" at 1247 (line 1246) are elided in this rendered
// listing.
1243 unsigned Sz) {
1244 if (Sz <= 1)
1245 return false;
1247 return false;
// Power-of-two element counts always legalize into full parts.
1248 if (has_single_bit(Sz))
1249 return true;
// Otherwise require the widened type to split into a whole number of
// parts whose per-part element count is a power of two.
1250 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1251 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1252 Sz % NumParts == 0;
1253}
1254
1255namespace slpvectorizer {
1256
1257/// Bottom Up SLP Vectorizer.
1258class BoUpSLP {
1259 struct TreeEntry;
1260 struct ScheduleData;
1263
1264public:
1265 /// Tracks the state we can represent the loads in the given sequence.
1266 enum class LoadsState {
1267 Gather,
1268 Vectorize,
1271 };
1272
// Constructor: caches the analysis results and derives the min/max vector
// register sizes from the target, honoring the command-line overrides.
// NOTE(review): the signature lines 1281 and 1283-1284 are elided in this
// rendered listing.
1280
1282 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1285 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1286 AC(AC), DB(DB), DL(DL), ORE(ORE),
1287 Builder(Se->getContext(), TargetFolder(*DL)) {
// Ephemeral values (only feeding assumes) must never be vectorized roots.
1288 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1289 // Use the vector register size specified by the target unless overridden
1290 // by a command-line option.
1291 // TODO: It would be better to limit the vectorization factor based on
1292 // data type rather than just register size. For example, x86 AVX has
1293 // 256-bit registers, but it does not support integer operations
1294 // at that width (that requires AVX2).
1295 if (MaxVectorRegSizeOption.getNumOccurrences())
1296 MaxVecRegSize = MaxVectorRegSizeOption;
1297 else
1298 MaxVecRegSize =
// NOTE(review): line 1299 (the TTI register-width query whose fixed value
// is taken below) is elided in this rendered listing.
1300 .getFixedValue();
1301
1302 if (MinVectorRegSizeOption.getNumOccurrences())
1303 MinVecRegSize = MinVectorRegSizeOption;
1304 else
1305 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1306 }
1307
1308 /// Vectorize the tree that starts with the elements in \p VL.
1309 /// Returns the vectorized root.
1311
1312 /// Vectorize the tree but with the list of externally used values \p
1313 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1314 /// generated extractvalue instructions.
1315 Value *
1316 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1317 Instruction *ReductionRoot = nullptr);
1318
1319 /// \returns the cost incurred by unwanted spills and fills, caused by
1320 /// holding live values over call sites.
1322
1323 /// \returns the vectorization cost of the subtree that starts at \p VL.
1324 /// A negative number means that this is profitable.
1325 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1326
1327 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1328 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1329 void buildTree(ArrayRef<Value *> Roots,
1330 const SmallDenseSet<Value *> &UserIgnoreLst);
1331
1332 /// Construct a vectorizable tree that starts at \p Roots.
1333 void buildTree(ArrayRef<Value *> Roots);
1334
1335 /// Returns whether the root node has in-tree uses.
1337 return !VectorizableTree.empty() &&
1338 !VectorizableTree.front()->UserTreeIndices.empty();
1339 }
1340
1341 /// Return the scalars of the root node.
1343 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1344 return VectorizableTree.front()->Scalars;
1345 }
1346
1347 /// Checks if the root graph node can be emitted with narrower bitwidth at
1348 /// codegen and returns it signedness, if so.
1350 return MinBWs.at(VectorizableTree.front().get()).second;
1351 }
1352
1353 /// Builds external uses of the vectorized scalars, i.e. the list of
1354 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1355 /// ExternallyUsedValues contains additional list of external uses to handle
1356 /// vectorization of reductions.
1357 void
1358 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1359
1360 /// Transforms graph nodes to target specific representations, if profitable.
1361 void transformNodes();
1362
1363 /// Clear the internal data structures that are created by 'buildTree'.
1364 void deleteTree() {
1365 VectorizableTree.clear();
1366 ScalarToTreeEntry.clear();
1367 MultiNodeScalars.clear();
1368 MustGather.clear();
1369 NonScheduledFirst.clear();
1370 EntryToLastInstruction.clear();
1371 GatheredLoadsEntriesFirst = NoGatheredLoads;
1372 ExternalUses.clear();
1373 ExternalUsesAsOriginalScalar.clear();
1374 for (auto &Iter : BlocksSchedules) {
1375 BlockScheduling *BS = Iter.second.get();
1376 BS->clear();
1377 }
1378 MinBWs.clear();
1379 ReductionBitWidth = 0;
1380 BaseGraphSize = 1;
1381 CastMaxMinBWSizes.reset();
1382 ExtraBitWidthNodes.clear();
1383 InstrElementSize.clear();
1384 UserIgnoreList = nullptr;
1385 PostponedGathers.clear();
1386 ValueToGatherNodes.clear();
1387 }
1388
// \returns the current number of nodes in the vectorizable tree.
1389 unsigned getTreeSize() const { return VectorizableTree.size(); }
1390
1391 /// Returns the base graph size, before any transformations.
1392 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1393
1394 /// Perform LICM and CSE on the newly generated gather sequences.
1396
1397 /// Does this non-empty order represent an identity order? Identity
1398 /// should be represented as an empty order, so this is used to
1399 /// decide if we can canonicalize a computed order. Undef elements
1400 /// (represented as size) are ignored.
1402 assert(!Order.empty() && "expected non-empty order");
1403 const unsigned Sz = Order.size();
1404 return all_of(enumerate(Order), [&](const auto &P) {
1405 return P.value() == P.index() || P.value() == Sz;
1406 });
1407 }
1408
1409 /// Checks if the specified gather tree entry \p TE can be represented as a
1410 /// shuffled vector entry + (possibly) permutation with other gathers. It
1411 /// implements the checks only for possibly ordered scalars (Loads,
1412 /// ExtractElement, ExtractValue), which can be part of the graph.
1413 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1414
1415 /// Sort loads into increasing pointers offsets to allow greater clustering.
1416 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1417
1418 /// Gets reordering data for the given tree entry. If the entry is vectorized
1419 /// - just return ReorderIndices, otherwise check if the scalars can be
1420 /// reordered and return the most optimal order.
1421 /// \return std::nullopt if ordering is not important, empty order, if
1422 /// identity order is important, or the actual order.
1423 /// \param TopToBottom If true, include the order of vectorized stores and
1424 /// insertelement nodes, otherwise skip them.
1425 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1426 bool TopToBottom);
1427
1428 /// Reorders the current graph to the most profitable order starting from the
1429 /// root node to the leaf nodes. The best order is chosen only from the nodes
1430 /// of the same size (vectorization factor). Smaller nodes are considered
1431 /// parts of subgraph with smaller VF and they are reordered independently. We
1432 /// can make it because we still need to extend smaller nodes to the wider VF
1433 /// and we can merge reordering shuffles with the widening shuffles.
1434 void reorderTopToBottom();
1435
1436 /// Reorders the current graph to the most profitable order starting from
1437 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1438 /// number of reshuffles if the leaf nodes use the same order. In this case we
1439 /// can merge the orders and just shuffle user node instead of shuffling its
1440 /// operands. Plus, even the leaf nodes have different orders, it allows to
1441 /// sink reordering in the graph closer to the root node and merge it later
1442 /// during analysis.
1443 void reorderBottomToTop(bool IgnoreReorder = false);
1444
1445 /// \return The vector element size in bits to use when vectorizing the
1446 /// expression tree ending at \p V. If V is a store, the size is the width of
1447 /// the stored value. Otherwise, the size is the width of the largest loaded
1448 /// value reaching V. This method is used by the vectorizer to calculate
1449 /// vectorization factors.
1450 unsigned getVectorElementSize(Value *V);
1451
1452 /// Compute the minimum type sizes required to represent the entries in a
1453 /// vectorizable tree.
1455
1456 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1457 unsigned getMaxVecRegSize() const {
1458 return MaxVecRegSize;
1459 }
1460
1461 // \returns minimum vector register size as set by cl::opt.
1462 unsigned getMinVecRegSize() const {
1463 return MinVecRegSize;
1464 }
1465
// \returns the smallest vectorization factor worth considering for scalars
// of bit-size \p Sz — the element count filling the minimum vector
// register, but never fewer than 2 lanes.
1466 unsigned getMinVF(unsigned Sz) const {
1467 return std::max(2U, getMinVecRegSize() / Sz);
1468 }
1469
1470 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1471 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1472 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1473 return MaxVF ? MaxVF : UINT_MAX;
1474 }
1475
1476 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1477 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1478 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1479 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1480 ///
1481 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1482 unsigned canMapToVector(Type *T) const;
1483
1484 /// \returns True if the VectorizableTree is both tiny and not fully
1485 /// vectorizable. We do not vectorize such trees.
1486 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1487
1488 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1489 /// can be load combined in the backend. Load combining may not be allowed in
1490 /// the IR optimizer, so we do not want to alter the pattern. For example,
1491 /// partially transforming a scalar bswap() pattern into vector code is
1492 /// effectively impossible for the backend to undo.
1493 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1494 /// may not be necessary.
1495 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1496
1497 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1498 /// can be load combined in the backend. Load combining may not be allowed in
1499 /// the IR optimizer, so we do not want to alter the pattern. For example,
1500 /// partially transforming a scalar bswap() pattern into vector code is
1501 /// effectively impossible for the backend to undo.
1502 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1503 /// may not be necessary.
1504 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1505
1506 /// Checks if the given array of loads can be represented as a vectorized,
1507 /// scatter or just simple gather.
1508 /// \param VL list of loads.
1509 /// \param VL0 main load value.
1510 /// \param Order returned order of load instructions.
1511 /// \param PointerOps returned list of pointer operands.
1512 /// \param BestVF return best vector factor, if recursive check found better
1513 /// vectorization sequences rather than masked gather.
1514 /// \param TryRecursiveCheck used to check if long masked gather can be
1515 /// represented as a series of loads/insert subvector, if profitable.
1518 SmallVectorImpl<Value *> &PointerOps,
1519 unsigned *BestVF = nullptr,
1520 bool TryRecursiveCheck = true) const;
1521
1522 /// Registers non-vectorizable sequence of loads
// Records the hash of the load sequence so later attempts can bail early.
// NOTE(review): "Knonwn" is a pre-existing typo in the member name; it
// cannot be corrected here without touching every use site.
1523 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1524 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1525 }
1526
1527 /// Checks if the given loads sequence is known as not vectorizable
1528 template <typename T>
// NOTE(review): the signature line (1529) is elided in this rendered
// listing.
1530 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1531 }
1532
1534
1535 /// This structure holds any data we need about the edges being traversed
1536 /// during buildTree_rec(). We keep track of:
1537 /// (i) the user TreeEntry index, and
1538 /// (ii) the index of the edge.
1539 struct EdgeInfo {
1540 EdgeInfo() = default;
1541 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1543 /// The user TreeEntry.
1544 TreeEntry *UserTE = nullptr;
1545 /// The operand index of the use.
1546 unsigned EdgeIdx = UINT_MAX;
1547#ifndef NDEBUG
1549 const BoUpSLP::EdgeInfo &EI) {
1550 EI.dump(OS);
1551 return OS;
1552 }
1553 /// Debug print.
1554 void dump(raw_ostream &OS) const {
1555 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1556 << " EdgeIdx:" << EdgeIdx << "}";
1557 }
1558 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1559#endif
1560 bool operator == (const EdgeInfo &Other) const {
1561 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1562 }
1563 };
1564
1565 /// A helper class used for scoring candidates for two consecutive lanes.
1567 const TargetLibraryInfo &TLI;
1568 const DataLayout &DL;
1569 ScalarEvolution &SE;
1570 const BoUpSLP &R;
1571 int NumLanes; // Total number of lanes (aka vectorization factor).
1572 int MaxLevel; // The maximum recursion depth for accumulating score.
1573
1574 public:
1576 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1577 int MaxLevel)
1578 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1579 MaxLevel(MaxLevel) {}
1580
1581 // The hard-coded scores listed here are not very important, though it shall
1582 // be higher for better matches to improve the resulting cost. When
1583 // computing the scores of matching one sub-tree with another, we are
1584 // basically counting the number of values that are matching. So even if all
1585 // scores are set to 1, we would still get a decent matching result.
1586 // However, sometimes we have to break ties. For example we may have to
1587 // choose between matching loads vs matching opcodes. This is what these
1588 // scores are helping us with: they provide the order of preference. Also,
1589 // this is important if the scalar is externally used or used in another
1590 // tree entry node in the different lane.
1591
1592 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1593 static const int ScoreConsecutiveLoads = 4;
1594 /// The same load multiple times. This should have a better score than
1595 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1596 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1597 /// a vector load and 1.0 for a broadcast.
1598 static const int ScoreSplatLoads = 3;
1599 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1600 static const int ScoreReversedLoads = 3;
1601 /// A load candidate for masked gather.
1602 static const int ScoreMaskedGatherCandidate = 1;
1603 /// ExtractElementInst from same vector and consecutive indexes.
1604 static const int ScoreConsecutiveExtracts = 4;
1605 /// ExtractElementInst from same vector and reversed indices.
1606 static const int ScoreReversedExtracts = 3;
1607 /// Constants.
1608 static const int ScoreConstants = 2;
1609 /// Instructions with the same opcode.
1610 static const int ScoreSameOpcode = 2;
1611 /// Instructions with alt opcodes (e.g, add + sub).
1612 static const int ScoreAltOpcodes = 1;
1613 /// Identical instructions (a.k.a. splat or broadcast).
1614 static const int ScoreSplat = 1;
1615 /// Matching with an undef is preferable to failing.
1616 static const int ScoreUndef = 1;
1617 /// Score for failing to find a decent match.
1618 static const int ScoreFail = 0;
1619 /// Score if all users are vectorized.
1620 static const int ScoreAllUserVectorized = 1;
1621
1622 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1623 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1624 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1625 /// MainAltOps.
1627 ArrayRef<Value *> MainAltOps) const {
1628 if (!isValidElementType(V1->getType()) ||
1629 !isValidElementType(V2->getType()))
1631
1632 if (V1 == V2) {
1633 if (isa<LoadInst>(V1)) {
1634 // Returns true if the users of V1 and V2 won't need to be extracted.
1635 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1636 // Bail out if we have too many uses to save compilation time.
1637 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1638 return false;
1639
1640 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1641 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1642 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1643 });
1644 };
1645 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1646 };
1647 // A broadcast of a load can be cheaper on some targets.
1648 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1649 ElementCount::getFixed(NumLanes)) &&
1650 ((int)V1->getNumUses() == NumLanes ||
1651 AllUsersAreInternal(V1, V2)))
1653 }
1655 }
1656
1657 auto CheckSameEntryOrFail = [&]() {
1658 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1659 TE1 && TE1 == R.getTreeEntry(V2))
1662 };
1663
1664 auto *LI1 = dyn_cast<LoadInst>(V1);
1665 auto *LI2 = dyn_cast<LoadInst>(V2);
1666 if (LI1 && LI2) {
1667 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1668 !LI2->isSimple())
1669 return CheckSameEntryOrFail();
1670
1671 std::optional<int> Dist = getPointersDiff(
1672 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1673 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1674 if (!Dist || *Dist == 0) {
1675 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1676 getUnderlyingObject(LI2->getPointerOperand()) &&
1677 R.TTI->isLegalMaskedGather(
1678 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1680 return CheckSameEntryOrFail();
1681 }
1682 // The distance is too large - still may be profitable to use masked
1683 // loads/gathers.
1684 if (std::abs(*Dist) > NumLanes / 2)
1686 // This still will detect consecutive loads, but we might have "holes"
1687 // in some cases. It is ok for non-power-2 vectorization and may produce
1688 // better results. It should not affect current vectorization.
1691 }
1692
1693 auto *C1 = dyn_cast<Constant>(V1);
1694 auto *C2 = dyn_cast<Constant>(V2);
1695 if (C1 && C2)
1697
1698 // Extracts from consecutive indexes of the same vector better score as
1699 // the extracts could be optimized away.
1700 Value *EV1;
1701 ConstantInt *Ex1Idx;
1702 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1703 // Undefs are always profitable for extractelements.
1704 // Compiler can easily combine poison and extractelement <non-poison> or
1705 // undef and extractelement <poison>. But combining undef +
1706 // extractelement <non-poison-but-may-produce-poison> requires some
1707 // extra operations.
1708 if (isa<UndefValue>(V2))
1709 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1712 Value *EV2 = nullptr;
1713 ConstantInt *Ex2Idx = nullptr;
1714 if (match(V2,
1716 m_Undef())))) {
1717 // Undefs are always profitable for extractelements.
1718 if (!Ex2Idx)
1720 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1722 if (EV2 == EV1) {
1723 int Idx1 = Ex1Idx->getZExtValue();
1724 int Idx2 = Ex2Idx->getZExtValue();
1725 int Dist = Idx2 - Idx1;
1726 // The distance is too large - still may be profitable to use
1727 // shuffles.
1728 if (std::abs(Dist) == 0)
1730 if (std::abs(Dist) > NumLanes / 2)
1734 }
1736 }
1737 return CheckSameEntryOrFail();
1738 }
1739
1740 auto *I1 = dyn_cast<Instruction>(V1);
1741 auto *I2 = dyn_cast<Instruction>(V2);
1742 if (I1 && I2) {
1743 if (I1->getParent() != I2->getParent())
1744 return CheckSameEntryOrFail();
1745 SmallVector<Value *, 4> Ops(MainAltOps);
1746 Ops.push_back(I1);
1747 Ops.push_back(I2);
1748 InstructionsState S = getSameOpcode(Ops, TLI);
1749 // Note: Only consider instructions with <= 2 operands to avoid
1750 // complexity explosion.
1751 if (S.getOpcode() &&
1752 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1753 !S.isAltShuffle()) &&
1754 all_of(Ops, [&S](Value *V) {
1755 return cast<Instruction>(V)->getNumOperands() ==
1756 S.MainOp->getNumOperands();
1757 }))
1758 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1760 }
1761
1762 if (isa<UndefValue>(V2))
1764
1765 return CheckSameEntryOrFail();
1766 }
1767
1768 /// Go through the operands of \p LHS and \p RHS recursively until
1769 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1770 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1771 /// of \p U1 and \p U2), except at the beginning of the recursion where
1772 /// these are set to nullptr.
1773 ///
1774 /// For example:
1775 /// \verbatim
1776 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1777 /// \ / \ / \ / \ /
1778 /// + + + +
1779 /// G1 G2 G3 G4
1780 /// \endverbatim
1781 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1782 /// each level recursively, accumulating the score. It starts from matching
1783 /// the additions at level 0, then moves on to the loads (level 1). The
1784 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1785 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1786 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1787 /// Please note that the order of the operands does not matter, as we
1788 /// evaluate the score of all profitable combinations of operands. In
1789 /// other words the score of G1 and G4 is the same as G1 and G2. This
1790 /// heuristic is based on ideas described in:
1791 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1792 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1793 /// Luís F. W. Góes
                         Instruction *U2, int CurrLevel,
                         ArrayRef<Value *> MainAltOps) const {

    // Get the shallow score of V1 and V2.
    int ShallowScoreAtThisLevel =
        getShallowScore(LHS, RHS, U1, U2, MainAltOps);

    // If reached MaxLevel,
    // or if V1 and V2 are not instructions,
    // or if they are SPLAT,
    // or if they are not consecutive,
    // or if profitable to vectorize loads or extractelements, early return
    // the current cost.
    auto *I1 = dyn_cast<Instruction>(LHS);
    auto *I2 = dyn_cast<Instruction>(RHS);
    // NOTE(review): one continuation line of this condition is not visible in
    // this view of the file.
    if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
        ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
        (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
          (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
          ShallowScoreAtThisLevel))
      return ShallowScoreAtThisLevel;
    assert(I1 && I2 && "Should have early exited.");

    // Contains the I2 operand indexes that got matched with I1 operands.
    SmallSet<unsigned, 4> Op2Used;

    // Recursion towards the operands of I1 and I2. We are trying all possible
    // operand pairs, and keeping track of the best score.
    for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
         OpIdx1 != NumOperands1; ++OpIdx1) {
      // Try to pair op1I with the best operand of I2.
      int MaxTmpScore = 0;
      unsigned MaxOpIdx2 = 0;
      bool FoundBest = false;
      // If I2 is commutative try all combinations.
      unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
      unsigned ToIdx = isCommutative(I2)
                           ? I2->getNumOperands()
                           : std::min(I2->getNumOperands(), OpIdx1 + 1);
      assert(FromIdx <= ToIdx && "Bad index");
      for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
        // Skip operands already paired with OpIdx1.
        if (Op2Used.count(OpIdx2))
          continue;
        // Recursively calculate the cost at each level
        int TmpScore =
            getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
                               I1, I2, CurrLevel + 1, {});
        // Look for the best score.
        if (TmpScore > LookAheadHeuristics::ScoreFail &&
            TmpScore > MaxTmpScore) {
          MaxTmpScore = TmpScore;
          MaxOpIdx2 = OpIdx2;
          FoundBest = true;
        }
      }
      if (FoundBest) {
        // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
        // The greedy per-operand maxima are accumulated into the result.
        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
      }
    }
    return ShallowScoreAtThisLevel;
  }
1860 };
1861 /// A helper data structure to hold the operands of a vector of instructions.
1862 /// This supports a fixed vector length for all operand vectors.
1864 /// For each operand we need (i) the value, and (ii) the opcode that it
1865 /// would be attached to if the expression was in a left-linearized form.
1866 /// This is required to avoid illegal operand reordering.
1867 /// For example:
1868 /// \verbatim
1869 /// 0 Op1
1870 /// |/
1871 /// Op1 Op2 Linearized + Op2
1872 /// \ / ----------> |/
1873 /// - -
1874 ///
1875 /// Op1 - Op2 (0 + Op1) - Op2
1876 /// \endverbatim
1877 ///
1878 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1879 ///
1880 /// Another way to think of this is to track all the operations across the
1881 /// path from the operand all the way to the root of the tree and to
1882 /// calculate the operation that corresponds to this path. For example, the
1883 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1884 /// corresponding operation is a '-' (which matches the one in the
1885 /// linearized tree, as shown above).
1886 ///
1887 /// For lack of a better term, we refer to this operation as Accumulated
1888 /// Path Operation (APO).
  struct OperandData {
    OperandData() = default;
    OperandData(Value *V, bool APO, bool IsUsed)
        : V(V), APO(APO), IsUsed(IsUsed) {}
    /// The operand value.
    Value *V = nullptr;
    /// TreeEntries only allow a single opcode, or an alternate sequence of
    /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
    /// APO. It is set to 'true' if 'V' is attached to an inverse operation
    /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
    /// (e.g., Add/Mul)
    bool APO = false;
    /// Helper data for the reordering function: set once this entry has been
    /// matched/claimed during a reordering pass, and reset by clearUsed().
    bool IsUsed = false;
  };
1904
  /// During operand reordering, we are trying to select the operand at lane
  /// that matches best with the operand at the neighboring lane. Our
  /// selection is based on the type of value we are looking for. For example,
  /// if the neighboring lane has a load, we need to look for a load that is
  /// accessing a consecutive address. These strategies are summarized in the
  /// 'ReorderingMode' enumerator. A mode is chosen per operand index, based
  /// on the operand seen at the first visited lane (see reorder()).
  enum class ReorderingMode {
    Load,     ///< Matching loads to consecutive memory addresses
    Opcode,   ///< Matching instructions based on opcode (same or alternate)
    Constant, ///< Matching constants
    Splat,    ///< Matching the same instruction multiple times (broadcast)
    Failed,   ///< We failed to create a vectorizable group
  };
1918
1920
1921 /// A vector of operand vectors.
1923
  /// Target library info, forwarded to the same-opcode checks and to the
  /// look-ahead heuristics.
  const TargetLibraryInfo &TLI;
  /// Data layout, forwarded to the look-ahead heuristics.
  const DataLayout &DL;
  /// Scalar evolution, forwarded to the look-ahead heuristics.
  ScalarEvolution &SE;
  /// The owning vectorizer instance.
  const BoUpSLP &R;
  /// Innermost loop containing the root instructions, if any (looked up from
  /// the first root instruction's parent block in the constructor).
  const Loop *L = nullptr;
1929
  /// \returns the operand data at \p OpIdx and \p Lane. Indexing is
  /// unchecked: \p OpIdx must be < getNumOperands() and \p Lane must be
  /// < getNumLanes().
  OperandData &getData(unsigned OpIdx, unsigned Lane) {
    return OpsVec[OpIdx][Lane];
  }

  /// \returns the operand data at \p OpIdx and \p Lane. Const version.
  const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
    return OpsVec[OpIdx][Lane];
  }
1939
1940 /// Clears the used flag for all entries.
1941 void clearUsed() {
1942 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1943 OpIdx != NumOperands; ++OpIdx)
1944 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1945 ++Lane)
1946 OpsVec[OpIdx][Lane].IsUsed = false;
1947 }
1948
1949 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1950 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1951 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1952 }
1953
  /// \param Lane lane of the operands under analysis.
  /// \param OpIdx operand index in \p Lane lane we're looking the best
  /// candidate for.
  /// \param Idx operand index of the current candidate value.
  /// \returns The additional score due to possible broadcasting of the
  /// elements in the lane. It is more profitable to have power-of-2 unique
  /// elements in the lane, it will be vectorized with higher probability
  /// after removing duplicates. Currently the SLP vectorizer supports only
  /// vectorization of the power-of-2 number of unique scalars.
  int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                    const SmallBitVector &UsedLanes) const {
    // No bonus if the candidate is not an instruction, is already the value
    // at (OpIdx, Lane), or is an extractelement.
    Value *IdxLaneV = getData(Idx, Lane).V;
    if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
        isa<ExtractElementInst>(IdxLaneV))
      return 0;
    // Collect the unique instruction operands of OpIdx in the other lanes.
    // NOTE(review): the declaration of 'Uniques' is not visible in this view
    // of the file.
    for (unsigned Ln : seq<unsigned>(getNumLanes())) {
      if (Ln == Lane)
        continue;
      Value *OpIdxLnV = getData(OpIdx, Ln).V;
      if (!isa<Instruction>(OpIdxLnV))
        return 0;
      Uniques.try_emplace(OpIdxLnV, Ln);
    }
    // Compare how close each choice brings the number of unique scalars to a
    // power of two; smaller distance is better.
    unsigned UniquesCount = Uniques.size();
    auto IdxIt = Uniques.find(IdxLaneV);
    unsigned UniquesCntWithIdxLaneV =
        IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    auto OpIdxIt = Uniques.find(OpIdxLaneV);
    unsigned UniquesCntWithOpIdxLaneV =
        OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
    if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
      return 0;
    return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                        UniquesCntWithOpIdxLaneV,
                    UniquesCntWithOpIdxLaneV -
                        bit_floor(UniquesCntWithOpIdxLaneV)) -
           ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
  }
1996
  /// \param Lane lane of the operands under analysis.
  /// \param OpIdx operand index in \p Lane lane we're looking the best
  /// candidate for.
  /// \param Idx operand index of the current candidate value.
  /// \returns The additional score for the scalar which users are all
  /// vectorized.
  int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
    Value *IdxLaneV = getData(Idx, Lane).V;
    Value *OpIdxLaneV = getData(OpIdx, Lane).V;
    // Do not care about number of uses for vector-like instructions
    // (extractelement/extractvalue with constant indices), they are extracts
    // themselves and already externally used. Vectorization of such
    // instructions does not add extra extractelement instruction, just may
    // remove it.
    // NOTE(review): the returned value of this branch is not visible in this
    // view of the file.
    if (isVectorLikeInstWithConstOps(IdxLaneV) &&
        isVectorLikeInstWithConstOps(OpIdxLaneV))
    auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
    if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
      return 0;
    // Reward a candidate whose users have all been vectorized already.
    // NOTE(review): the rewarded score value is not visible in this view of
    // the file.
    return R.areAllUsersVectorized(IdxLaneI)
               : 0;
  }

  /// Score scaling factor for fully compatible instructions but with
  /// different number of external uses. Allows better selection of the
  /// instructions with less external uses.
  static const int ScoreScaleFactor = 10;
2026
  /// \Returns the look-ahead score, which tells us how much the sub-trees
  /// rooted at \p LHS and \p RHS match, the more they match the higher the
  /// score. This helps break ties in an informed way when we cannot decide on
  /// the order of the operands by just considering the immediate
  /// predecessors. \p IsUsed is set when the candidate earned a non-zero
  /// score and should be claimed.
  int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                        int Lane, unsigned OpIdx, unsigned Idx,
                        bool &IsUsed, const SmallBitVector &UsedLanes) {
    // NOTE(review): the final constructor argument of the next statement is
    // not visible in this view of the file.
    LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
    // Keep track of the instruction stack as we recurse into the operands
    // during the look-ahead score exploration.
    int Score =
        LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                     /*CurrLevel=*/1, MainAltOps);
    if (Score) {
      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {
        // Failed score.
        Score = 0;
      } else {
        Score += SplatScore;
        // Scale score to see the difference between different operands
        // and similar operands but all vectorized/not all vectorized
        // uses. It does not affect actual selection of the best
        // compatible operand in general, just allows to select the
        // operand with all vectorized uses.
        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
        IsUsed = true;
      }
    }
    return Score;
  }
2061
2062 /// Best defined scores per lanes between the passes. Used to choose the
2063 /// best operand (with the highest score) between the passes.
2064 /// The key - {Operand Index, Lane}.
2065 /// The value - the best score between the passes for the lane and the
2066 /// operand.
2068 BestScoresPerLanes;
2069
  // Search all operands in Ops[*][Lane] for the one that matches best
  // Ops[OpIdx][LastLane] and return its operand index.
  // If no good match can be found, return std::nullopt.
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                 ArrayRef<ReorderingMode> ReorderingModes,
                 ArrayRef<Value *> MainAltOps,
                 const SmallBitVector &UsedLanes) {
    unsigned NumOperands = getNumOperands();

    // The operand of the previous lane at OpIdx.
    Value *OpLastLane = getData(OpIdx, LastLane).V;

    // Our strategy mode for OpIdx.
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    // The linearized opcode of the operand at OpIdx, Lane.
    bool OpIdxAPO = getData(OpIdx, Lane).APO;

    // The best operand index and its score.
    // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
    // are using the score to differentiate between the two.
    struct BestOpData {
      std::optional<unsigned> Idx;
      unsigned Score = 0;
    } BestOp;
    // Seed the score with the best score recorded for this (operand, lane)
    // pair on a previous pass, if any.
    BestOp.Score =
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
            .first->second;

    // Track if the operand must be marked as used. If the operand is set to
    // Score 1 explicitly (because of non power-of-2 unique scalars, we may
    // want to reestimate the operands again on the following iterations).
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    // Iterate through all unused operands and look for the best.
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      // Get the operand at Idx and Lane.
      OperandData &OpData = getData(Idx, Lane);
      Value *Op = OpData.V;
      bool OpAPO = OpData.APO;

      // Skip already selected operands.
      if (OpData.IsUsed)
        continue;

      // Skip if we are trying to move the operand to a position with a
      // different opcode in the linearized tree form. This would break the
      // semantics.
      if (OpAPO != OpIdxAPO)
        continue;

      // Look for an operand that matches the current mode.
      switch (RMode) {
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        // Keep the left/right order of the original lanes when scoring.
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
             Idx == OpIdx)) {
          BestOp.Idx = Idx;
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        }
        break;
      }
      case ReorderingMode::Constant:
        // NOTE(review): several lines of this case are not visible in this
        // view of the file.
        if (isa<Constant>(Op) ||
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestOp.Idx = Idx;
          if (isa<Constant>(Op)) {
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
          }
            IsUsed = false;
        }
        break;
      case ReorderingMode::Splat:
        // NOTE(review): the score assignment in this case is not fully
        // visible in this view of the file.
        if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestOp.Score = LookAheadHeuristics::ScoreSplat;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
          }
          BestOp.Idx = Idx;
        }
        break;
      case ReorderingMode::Failed:
        llvm_unreachable("Not expected Failed reordering mode.");
      }
    }

    if (BestOp.Idx) {
      // Claim the chosen operand per the mode's IsUsed policy.
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return BestOp.Idx;
    }
    // If we could not find a good match return std::nullopt.
    return std::nullopt;
  }
2179
  /// Helper for reorderOperandVecs.
  /// \returns the lane that we should start reordering from. This is the one
  /// which has the least number of operands that can freely move about or
  /// less profitable because it already has the most optimal set of operands.
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    // std::pair<unsigned, unsigned> is used to implement a simple voting
    // algorithm and choose the lane with the least number of operands that
    // can freely move about or less profitable because it already has the
    // most optimal set of operands. The first unsigned is a counter for
    // voting, the second unsigned is the counter of lanes with instructions
    // with same/alternate opcodes and same parent basic block.
    // NOTE(review): the declaration of 'HashMap' is not visible in this view
    // of the file.
    // Try to be closer to the original results, if we have multiple lanes
    // with same cost. If 2 lanes have the same cost, use the one with the
    // lowest index.
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      // Compare the number of operands that can move and choose the one with
      // the least number.
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap.clear();
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        // Select the most optimal lane in terms of number of operands that
        // should be moved around.
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        // A full tie: lanes with an identical operand fingerprint vote for
        // the same entry.
        auto *It = HashMap.find(NumFreeOpsHash.Hash);
        if (It == HashMap.end())
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        else
          ++It->second.first;
      }
    }
    // Select the lane with the minimum counter.
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
    for (const auto &Data : reverse(HashMap)) {
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
      }
    }
    return BestLane;
  }
2234
  /// Data structure that helps to reorder operands.
  struct OperandsOrderData {
    /// The best number of operands with the same APOs, which can be
    /// reordered.
    unsigned NumOfAPOs = UINT_MAX;
    /// Number of operands with the same/alternate instruction opcode and
    /// parent.
    unsigned NumOpsWithSameOpcodeParent = 0;
    /// Hash for the actual operands ordering.
    /// Used to count operands, actually their position id and opcode
    /// value. It is used in the voting mechanism to find the lane with the
    /// least number of operands that can freely move about or less profitable
    /// because it already has the most optimal set of operands. Can be
    /// replaced with SmallVector<unsigned> instead but hash code is faster
    /// and requires less memory. Two lanes with the same sequence of operand
    /// value ids produce the same hash.
    unsigned Hash = 0;
  };
2252 /// \returns the maximum number of operands that are allowed to be reordered
2253 /// for \p Lane and the number of compatible instructions(with the same
2254 /// parent/opcode). This is used as a heuristic for selecting the first lane
2255 /// to start operand reordering.
2256 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2257 unsigned CntTrue = 0;
2258 unsigned NumOperands = getNumOperands();
2259 // Operands with the same APO can be reordered. We therefore need to count
2260 // how many of them we have for each APO, like this: Cnt[APO] = x.
2261 // Since we only have two APOs, namely true and false, we can avoid using
2262 // a map. Instead we can simply count the number of operands that
2263 // correspond to one of them (in this case the 'true' APO), and calculate
2264 // the other by subtracting it from the total number of operands.
2265 // Operands with the same instruction opcode and parent are more
2266 // profitable since we don't need to move them in many cases, with a high
2267 // probability such lane already can be vectorized effectively.
2268 bool AllUndefs = true;
2269 unsigned NumOpsWithSameOpcodeParent = 0;
2270 Instruction *OpcodeI = nullptr;
2271 BasicBlock *Parent = nullptr;
2272 unsigned Hash = 0;
2273 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2274 const OperandData &OpData = getData(OpIdx, Lane);
2275 if (OpData.APO)
2276 ++CntTrue;
2277 // Use Boyer-Moore majority voting for finding the majority opcode and
2278 // the number of times it occurs.
2279 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2280 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2281 I->getParent() != Parent) {
2282 if (NumOpsWithSameOpcodeParent == 0) {
2283 NumOpsWithSameOpcodeParent = 1;
2284 OpcodeI = I;
2285 Parent = I->getParent();
2286 } else {
2287 --NumOpsWithSameOpcodeParent;
2288 }
2289 } else {
2290 ++NumOpsWithSameOpcodeParent;
2291 }
2292 }
2293 Hash = hash_combine(
2294 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2295 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2296 }
2297 if (AllUndefs)
2298 return {};
2299 OperandsOrderData Data;
2300 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2301 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2302 Data.Hash = Hash;
2303 return Data;
2304 }
2305
2306 /// Go through the instructions in VL and append their operands.
2307 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2308 assert(!VL.empty() && "Bad VL");
2309 assert((empty() || VL.size() == getNumLanes()) &&
2310 "Expected same number of lanes");
2311 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2312 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2313 constexpr unsigned IntrinsicNumOperands = 2;
2314 if (isa<IntrinsicInst>(VL[0]))
2315 NumOperands = IntrinsicNumOperands;
2316 OpsVec.resize(NumOperands);
2317 unsigned NumLanes = VL.size();
2318 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2319 OpsVec[OpIdx].resize(NumLanes);
2320 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2321 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2322 // Our tree has just 3 nodes: the root and two operands.
2323 // It is therefore trivial to get the APO. We only need to check the
2324 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2325 // RHS operand. The LHS operand of both add and sub is never attached
2326 // to an inversese operation in the linearized form, therefore its APO
2327 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2328
2329 // Since operand reordering is performed on groups of commutative
2330 // operations or alternating sequences (e.g., +, -), we can safely
2331 // tell the inverse operations by checking commutativity.
2332 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2333 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2334 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2335 APO, false};
2336 }
2337 }
2338 }
2339
  /// \returns the number of operands.
  unsigned getNumOperands() const { return OpsVec.size(); }

  /// \returns the number of lanes. Only valid once operands have been
  /// appended (OpsVec must be non-empty).
  unsigned getNumLanes() const { return OpsVec[0].size(); }

  /// \returns the operand value at \p OpIdx and \p Lane.
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  }

  /// \returns true if the data structure is empty.
  bool empty() const { return OpsVec.empty(); }

  /// Clears the data.
  void clear() { OpsVec.clear(); }
2356
2357 /// \Returns true if there are enough operands identical to \p Op to fill
2358 /// the whole vector (it is mixed with constants or loop invariant values).
2359 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2360 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2361 bool OpAPO = getData(OpIdx, Lane).APO;
2362 bool IsInvariant = L && L->isLoopInvariant(Op);
2363 unsigned Cnt = 0;
2364 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2365 if (Ln == Lane)
2366 continue;
2367 // This is set to true if we found a candidate for broadcast at Lane.
2368 bool FoundCandidate = false;
2369 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2370 OperandData &Data = getData(OpI, Ln);
2371 if (Data.APO != OpAPO || Data.IsUsed)
2372 continue;
2373 Value *OpILane = getValue(OpI, Lane);
2374 bool IsConstantOp = isa<Constant>(OpILane);
2375 // Consider the broadcast candidate if:
2376 // 1. Same value is found in one of the operands.
2377 if (Data.V == Op ||
2378 // 2. The operand in the given lane is not constant but there is a
2379 // constant operand in another lane (which can be moved to the
2380 // given lane). In this case we can represent it as a simple
2381 // permutation of constant and broadcast.
2382 (!IsConstantOp &&
2383 ((Lns > 2 && isa<Constant>(Data.V)) ||
2384 // 2.1. If we have only 2 lanes, need to check that value in the
2385 // next lane does not build same opcode sequence.
2386 (Lns == 2 &&
2387 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2388 .getOpcode() &&
2389 isa<Constant>(Data.V)))) ||
2390 // 3. The operand in the current lane is loop invariant (can be
2391 // hoisted out) and another operand is also a loop invariant
2392 // (though not a constant). In this case the whole vector can be
2393 // hoisted out.
2394 // FIXME: need to teach the cost model about this case for better
2395 // estimation.
2396 (IsInvariant && !isa<Constant>(Data.V) &&
2397 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2398 L->isLoopInvariant(Data.V))) {
2399 FoundCandidate = true;
2400 Data.IsUsed = Data.V == Op;
2401 if (Data.V == Op)
2402 ++Cnt;
2403 break;
2404 }
2405 }
2406 if (!FoundCandidate)
2407 return false;
2408 }
2409 return getNumLanes() == 2 || Cnt > 1;
2410 }
2411
2412 /// Checks if there is at least single compatible operand in lanes other
2413 /// than \p Lane, compatible with the operand \p Op.
2414 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2415 bool OpAPO = getData(OpIdx, Lane).APO;
2416 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2417 if (Ln == Lane)
2418 continue;
2419 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2420 const OperandData &Data = getData(OpI, Ln);
2421 if (Data.APO != OpAPO || Data.IsUsed)
2422 return true;
2423 Value *OpILn = getValue(OpI, Ln);
2424 return (L && L->isLoopInvariant(OpILn)) ||
2425 (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
2426 Op->getParent() == cast<Instruction>(OpILn)->getParent());
2427 }))
2428 return true;
2429 }
2430 return false;
2431 }
2432
public:
  /// Initialize with all the operands of the instruction vector \p RootVL.
  /// The enclosing loop (if any) is looked up from the parent block of the
  /// first root instruction.
  // NOTE(review): the constructor's signature line is not visible in this
  // view of the file.
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(
            (cast<Instruction>(RootVL.front())->getParent()))) {
    // Append all the operands of RootVL.
    appendOperandsOfVL(RootVL);
  }
2442
2443 /// \Returns a value vector with the operands across all lanes for the
2444 /// opearnd at \p OpIdx.
2445 ValueList getVL(unsigned OpIdx) const {
2446 ValueList OpVL(OpsVec[OpIdx].size());
2447 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2448 "Expected same num of lanes across all operands");
2449 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2450 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2451 return OpVL;
2452 }
2453
2454 // Performs operand reordering for 2 or more operands.
2455 // The original operands are in OrigOps[OpIdx][Lane].
2456 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2457 void reorder() {
2458 unsigned NumOperands = getNumOperands();
2459 unsigned NumLanes = getNumLanes();
2460 // Each operand has its own mode. We are using this mode to help us select
2461 // the instructions for each lane, so that they match best with the ones
2462 // we have selected so far.
2463 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2464
2465 // This is a greedy single-pass algorithm. We are going over each lane
2466 // once and deciding on the best order right away with no back-tracking.
2467 // However, in order to increase its effectiveness, we start with the lane
2468 // that has operands that can move the least. For example, given the
2469 // following lanes:
2470 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2471 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2472 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2473 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2474 // we will start at Lane 1, since the operands of the subtraction cannot
2475 // be reordered. Then we will visit the rest of the lanes in a circular
2476 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2477
2478 // Find the first lane that we will start our search from.
2479 unsigned FirstLane = getBestLaneToStartReordering();
2480
2481 // Initialize the modes.
2482 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2483 Value *OpLane0 = getValue(OpIdx, FirstLane);
2484 // Keep track if we have instructions with all the same opcode on one
2485 // side.
2486 if (isa<LoadInst>(OpLane0))
2487 ReorderingModes[OpIdx] = ReorderingMode::Load;
2488 else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2489 // Check if OpLane0 should be broadcast.
2490 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2491 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2492 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2493 else
2494 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2495 } else if (isa<Constant>(OpLane0))
2496 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2497 else if (isa<Argument>(OpLane0))
2498 // Our best hope is a Splat. It may save some cost in some cases.
2499 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2500 else
2501 // NOTE: This should be unreachable.
2502 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2503 }
2504
2505 // Check that we don't have same operands. No need to reorder if operands
2506 // are just perfect diamond or shuffled diamond match. Do not do it only
2507 // for possible broadcasts or non-power of 2 number of scalars (just for
2508 // now).
2509 auto &&SkipReordering = [this]() {
2510 SmallPtrSet<Value *, 4> UniqueValues;
2511 ArrayRef<OperandData> Op0 = OpsVec.front();
2512 for (const OperandData &Data : Op0)
2513 UniqueValues.insert(Data.V);
2514 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2515 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2516 return !UniqueValues.contains(Data.V);
2517 }))
2518 return false;
2519 }
2520 // TODO: Check if we can remove a check for non-power-2 number of
2521 // scalars after full support of non-power-2 vectorization.
2522 return UniqueValues.size() != 2 && has_single_bit(UniqueValues.size());
2523 };
2524
2525 // If the initial strategy fails for any of the operand indexes, then we
2526 // perform reordering again in a second pass. This helps avoid assigning
2527 // high priority to the failed strategy, and should improve reordering for
2528 // the non-failed operand indexes.
2529 for (int Pass = 0; Pass != 2; ++Pass) {
2530 // Check if no need to reorder operands since they're are perfect or
2531 // shuffled diamond match.
2532 // Need to do it to avoid extra external use cost counting for
2533 // shuffled matches, which may cause regressions.
2534 if (SkipReordering())
2535 break;
2536 // Skip the second pass if the first pass did not fail.
2537 bool StrategyFailed = false;
2538 // Mark all operand data as free to use.
2539 clearUsed();
2540 // We keep the original operand order for the FirstLane, so reorder the
2541 // rest of the lanes. We are visiting the nodes in a circular fashion,
2542 // using FirstLane as the center point and increasing the radius
2543 // distance.
2544 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2545 for (unsigned I = 0; I < NumOperands; ++I)
2546 MainAltOps[I].push_back(getData(I, FirstLane).V);
2547
2548 SmallBitVector UsedLanes(NumLanes);
2549 UsedLanes.set(FirstLane);
2550 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2551 // Visit the lane on the right and then the lane on the left.
2552 for (int Direction : {+1, -1}) {
2553 int Lane = FirstLane + Direction * Distance;
2554 if (Lane < 0 || Lane >= (int)NumLanes)
2555 continue;
2556 UsedLanes.set(Lane);
2557 int LastLane = Lane - Direction;
2558 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2559 "Out of bounds");
2560 // Look for a good match for each operand.
2561 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2562 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2563 std::optional<unsigned> BestIdx =
2564 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2565 MainAltOps[OpIdx], UsedLanes);
2566 // By not selecting a value, we allow the operands that follow to
2567 // select a better matching value. We will get a non-null value in
2568 // the next run of getBestOperand().
2569 if (BestIdx) {
2570 // Swap the current operand with the one returned by
2571 // getBestOperand().
2572 swap(OpIdx, *BestIdx, Lane);
2573 } else {
2574 // Enable the second pass.
2575 StrategyFailed = true;
2576 }
2577 // Try to get the alternate opcode and follow it during analysis.
2578 if (MainAltOps[OpIdx].size() != 2) {
2579 OperandData &AltOp = getData(OpIdx, Lane);
2580 InstructionsState OpS =
2581 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2582 if (OpS.getOpcode() && OpS.isAltShuffle())
2583 MainAltOps[OpIdx].push_back(AltOp.V);
2584 }
2585 }
2586 }
2587 }
2588 // Skip second pass if the strategy did not fail.
2589 if (!StrategyFailed)
2590 break;
2591 }
2592 }
2593
2594#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2595 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2596 switch (RMode) {
2597 case ReorderingMode::Load:
2598 return "Load";
2599 case ReorderingMode::Opcode:
2600 return "Opcode";
2601 case ReorderingMode::Constant:
2602 return "Constant";
2603 case ReorderingMode::Splat:
2604 return "Splat";
2605 case ReorderingMode::Failed:
2606 return "Failed";
2607 }
2608 llvm_unreachable("Unimplemented Reordering Type");
2609 }
2610
2611 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2612 raw_ostream &OS) {
2613 return OS << getModeStr(RMode);
2614 }
2615
2616 /// Debug print.
2617 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2618 printMode(RMode, dbgs());
2619 }
2620
2621 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2622 return printMode(RMode, OS);
2623 }
2624
2626 const unsigned Indent = 2;
2627 unsigned Cnt = 0;
2628 for (const OperandDataVec &OpDataVec : OpsVec) {
2629 OS << "Operand " << Cnt++ << "\n";
2630 for (const OperandData &OpData : OpDataVec) {
2631 OS.indent(Indent) << "{";
2632 if (Value *V = OpData.V)
2633 OS << *V;
2634 else
2635 OS << "null";
2636 OS << ", APO:" << OpData.APO << "}\n";
2637 }
2638 OS << "\n";
2639 }
2640 return OS;
2641 }
2642
2643 /// Debug print.
2644 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2645#endif
2646 };
2647
2648 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2649 /// for a pair which have highest score deemed to have best chance to form
2650 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2651 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2652 /// of the cost, considered to be good enough score.
2653 std::optional<int>
2654 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2655 int Limit = LookAheadHeuristics::ScoreFail) const {
2656 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2658 int BestScore = Limit;
2659 std::optional<int> Index;
2660 for (int I : seq<int>(0, Candidates.size())) {
2661 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2662 Candidates[I].second,
2663 /*U1=*/nullptr, /*U2=*/nullptr,
2664 /*CurrLevel=*/1, {});
2665 if (Score > BestScore) {
2666 BestScore = Score;
2667 Index = I;
2668 }
2669 }
2670 return Index;
2671 }
2672
2673 /// Checks if the instruction is marked for deletion.
2674 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2675
2676 /// Removes an instruction from its block and eventually deletes it.
2677 /// It's like Instruction::eraseFromParent() except that the actual deletion
2678 /// is delayed until BoUpSLP is destructed.
2680 DeletedInstructions.insert(I);
2681 }
2682
2683 /// Remove instructions from the parent function and clear the operands of \p
2684 /// DeadVals instructions, marking for deletion trivially dead operands.
2685 template <typename T>
2688 for (T *V : DeadVals) {
2689 auto *I = cast<Instruction>(V);
2690 DeletedInstructions.insert(I);
2691 }
2692 DenseSet<Value *> Processed;
2693 for (T *V : DeadVals) {
2694 if (!V || !Processed.insert(V).second)
2695 continue;
2696 auto *I = cast<Instruction>(V);
2699 if (const TreeEntry *Entry = getTreeEntry(I)) {
2700 Entries.push_back(Entry);
2701 auto It = MultiNodeScalars.find(I);
2702 if (It != MultiNodeScalars.end())
2703 Entries.append(It->second.begin(), It->second.end());
2704 }
2705 for (Use &U : I->operands()) {
2706 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2707 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2709 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2710 return Entry->VectorizedValue == OpI;
2711 })))
2712 DeadInsts.push_back(OpI);
2713 }
2714 I->dropAllReferences();
2715 }
2716 for (T *V : DeadVals) {
2717 auto *I = cast<Instruction>(V);
2718 if (!I->getParent())
2719 continue;
2720 assert((I->use_empty() || all_of(I->uses(),
2721 [&](Use &U) {
2722 return isDeleted(
2723 cast<Instruction>(U.getUser()));
2724 })) &&
2725 "trying to erase instruction with users.");
2726 I->removeFromParent();
2727 SE->forgetValue(I);
2728 }
2729 // Process the dead instruction list until empty.
2730 while (!DeadInsts.empty()) {
2731 Value *V = DeadInsts.pop_back_val();
2733 if (!VI || !VI->getParent())
2734 continue;
2736 "Live instruction found in dead worklist!");
2737 assert(VI->use_empty() && "Instructions with uses are not dead.");
2738
2739 // Don't lose the debug info while deleting the instructions.
2740 salvageDebugInfo(*VI);
2741
2742 // Null out all of the instruction's operands to see if any operand
2743 // becomes dead as we go.
2744 for (Use &OpU : VI->operands()) {
2745 Value *OpV = OpU.get();
2746 if (!OpV)
2747 continue;
2748 OpU.set(nullptr);
2749
2750 if (!OpV->use_empty())
2751 continue;
2752
2753 // If the operand is an instruction that became dead as we nulled out
2754 // the operand, and if it is 'trivially' dead, delete it in a future
2755 // loop iteration.
2756 if (auto *OpI = dyn_cast<Instruction>(OpV))
2757 if (!DeletedInstructions.contains(OpI) &&
2759 DeadInsts.push_back(OpI);
2760 }
2761
2762 VI->removeFromParent();
2763 DeletedInstructions.insert(VI);
2764 SE->forgetValue(VI);
2765 }
2766 }
2767
2768 /// Checks if the instruction was already analyzed for being possible
2769 /// reduction root.
2771 return AnalyzedReductionsRoots.count(I);
2772 }
2773 /// Register given instruction as already analyzed for being possible
2774 /// reduction root.
2776 AnalyzedReductionsRoots.insert(I);
2777 }
2778 /// Checks if the provided list of reduced values was checked already for
2779 /// vectorization.
2781 return AnalyzedReductionVals.contains(hash_value(VL));
2782 }
2783 /// Adds the list of reduced values to list of already checked values for the
2784 /// vectorization.
2786 AnalyzedReductionVals.insert(hash_value(VL));
2787 }
2788 /// Clear the list of the analyzed reduction root instructions.
2790 AnalyzedReductionsRoots.clear();
2791 AnalyzedReductionVals.clear();
2792 AnalyzedMinBWVals.clear();
2793 }
2794 /// Checks if the given value is gathered in one of the nodes.
2795 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2796 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2797 }
2798 /// Checks if the given value is gathered in one of the nodes.
2799 bool isGathered(const Value *V) const {
2800 return MustGather.contains(V);
2801 }
2802 /// Checks if the specified value was not schedule.
2803 bool isNotScheduled(const Value *V) const {
2804 return NonScheduledFirst.contains(V);
2805 }
2806
2807 /// Check if the value is vectorized in the tree.
2808 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2809
2810 ~BoUpSLP();
2811
2812private:
2813 /// Determine if a node \p E in can be demoted to a smaller type with a
2814 /// truncation. We collect the entries that will be demoted in ToDemote.
2815 /// \param E Node for analysis
2816 /// \param ToDemote indices of the nodes to be demoted.
2817 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2818 unsigned &BitWidth,
2819 SmallVectorImpl<unsigned> &ToDemote,
2821 unsigned &MaxDepthLevel,
2822 bool &IsProfitableToDemote,
2823 bool IsTruncRoot) const;
2824
2825 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2826 /// reordering (i.e. the operands can be reordered because they have only one
2827 /// user and reordarable).
2828 /// \param ReorderableGathers List of all gather nodes that require reordering
2829 /// (e.g., gather of extractlements or partially vectorizable loads).
2830 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2831 /// reordering, subset of \p NonVectorized.
2832 bool
2833 canReorderOperands(TreeEntry *UserTE,
2834 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2835 ArrayRef<TreeEntry *> ReorderableGathers,
2836 SmallVectorImpl<TreeEntry *> &GatherOps);
2837
2838 /// Checks if the given \p TE is a gather node with clustered reused scalars
2839 /// and reorders it per given \p Mask.
2840 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2841
2842 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2843 /// if any. If it is not vectorized (gather node), returns nullptr.
2844 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2845 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2846 TreeEntry *TE = nullptr;
2847 const auto *It = find_if(VL, [&](Value *V) {
2848 TE = getTreeEntry(V);
2849 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2850 return true;
2851 auto It = MultiNodeScalars.find(V);
2852 if (It != MultiNodeScalars.end()) {
2853 for (TreeEntry *E : It->second) {
2854 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2855 TE = E;
2856 return true;
2857 }
2858 }
2859 }
2860 return false;
2861 });
2862 if (It != VL.end()) {
2863 assert(TE->isSame(VL) && "Expected same scalars.");
2864 return TE;
2865 }
2866 return nullptr;
2867 }
2868
2869 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2870 /// if any. If it is not vectorized (gather node), returns nullptr.
2871 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2872 unsigned OpIdx) const {
2873 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2874 const_cast<TreeEntry *>(UserTE), OpIdx);
2875 }
2876
2877 /// Checks if all users of \p I are the part of the vectorization tree.
2878 bool areAllUsersVectorized(
2879 Instruction *I,
2880 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2881
2882 /// Return information about the vector formed for the specified index
2883 /// of a vector of (the same) instruction.
2885
2886 /// \ returns the graph entry for the \p Idx operand of the \p E entry.
2887 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2888
2889 /// Gets the root instruction for the given node. If the node is a strided
2890 /// load/store node with the reverse order, the root instruction is the last
2891 /// one.
2892 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
2893
2894 /// \returns Cast context for the given graph node.
2896 getCastContextHint(const TreeEntry &TE) const;
2897
2898 /// \returns the cost of the vectorizable entry.
2899 InstructionCost getEntryCost(const TreeEntry *E,
2900 ArrayRef<Value *> VectorizedVals,
2901 SmallPtrSetImpl<Value *> &CheckedExtracts);
2902
2903 /// This is the recursive part of buildTree.
2904 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2905 const EdgeInfo &EI);
2906
2907 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2908 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2909 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2910 /// returns false, setting \p CurrentOrder to either an empty vector or a
2911 /// non-identity permutation that allows to reuse extract instructions.
2912 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2913 /// extract order.
2914 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2915 SmallVectorImpl<unsigned> &CurrentOrder,
2916 bool ResizeAllowed = false) const;
2917
2918 /// Vectorize a single entry in the tree.
2919 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2920 /// avoid issues with def-use order.
2921 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2922
2923 /// Returns vectorized operand node, that matches the order of the scalars
2924 /// operand number \p NodeIdx in entry \p E.
2925 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
2926 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
2927 unsigned NodeIdx) const {
2928 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
2929 }
2930
2931 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2932 /// \p E.
2933 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2934 /// avoid issues with def-use order.
2935 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2936
2937 /// Create a new vector from a list of scalar values. Produces a sequence
2938 /// which exploits values reused across lanes, and arranges the inserts
2939 /// for ease of later optimization.
2940 template <typename BVTy, typename ResTy, typename... Args>
2941 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2942
2943 /// Create a new vector from a list of scalar values. Produces a sequence
2944 /// which exploits values reused across lanes, and arranges the inserts
2945 /// for ease of later optimization.
2946 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
2947 bool PostponedPHIs);
2948
2949 /// Returns the instruction in the bundle, which can be used as a base point
2950 /// for scheduling. Usually it is the last instruction in the bundle, except
2951 /// for the case when all operands are external (in this case, it is the first
2952 /// instruction in the list).
2953 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2954
2955 /// Tries to find extractelement instructions with constant indices from fixed
2956 /// vector type and gather such instructions into a bunch, which highly likely
2957 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2958 /// was successful, the matched scalars are replaced by poison values in \p VL
2959 /// for future analysis.
2960 std::optional<TargetTransformInfo::ShuffleKind>
2961 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2962 SmallVectorImpl<int> &Mask) const;
2963
2964 /// Tries to find extractelement instructions with constant indices from fixed
2965 /// vector type and gather such instructions into a bunch, which highly likely
2966 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2967 /// was successful, the matched scalars are replaced by poison values in \p VL
2968 /// for future analysis.
2970 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2972 unsigned NumParts) const;
2973
2974 /// Checks if the gathered \p VL can be represented as a single register
2975 /// shuffle(s) of previous tree entries.
2976 /// \param TE Tree entry checked for permutation.
2977 /// \param VL List of scalars (a subset of the TE scalar), checked for
2978 /// permutations. Must form single-register vector.
2979 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2980 /// commands to build the mask using the original vector value, without
2981 /// relying on the potential reordering.
2982 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2983 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2984 std::optional<TargetTransformInfo::ShuffleKind>
2985 isGatherShuffledSingleRegisterEntry(
2986 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2987 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2988 bool ForOrder);
2989
2990 /// Checks if the gathered \p VL can be represented as multi-register
2991 /// shuffle(s) of previous tree entries.
2992 /// \param TE Tree entry checked for permutation.
2993 /// \param VL List of scalars (a subset of the TE scalar), checked for
2994 /// permutations.
2995 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2996 /// commands to build the mask using the original vector value, without
2997 /// relying on the potential reordering.
2998 /// \returns per-register series of ShuffleKind, if gathered values can be
2999 /// represented as shuffles of previous tree entries. \p Mask is filled with
3000 /// the shuffle mask (also on per-register base).
3002 isGatherShuffledEntry(
3003 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3005 unsigned NumParts, bool ForOrder = false);
3006
3007 /// \returns the scalarization cost for this list of values. Assuming that
3008 /// this subtree gets vectorized, we may need to extract the values from the
3009 /// roots. This method calculates the cost of extracting the values.
3010 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3011 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3012 Type *ScalarTy) const;
3013
3014 /// Set the Builder insert point to one after the last instruction in
3015 /// the bundle
3016 void setInsertPointAfterBundle(const TreeEntry *E);
3017
3018 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3019 /// specified, the starting vector value is poison.
3020 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
3021
3022 /// \returns whether the VectorizableTree is fully vectorizable and will
3023 /// be beneficial even the tree height is tiny.
3024 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3025
3026 /// Run through the list of all gathered loads in the graph and try to find
3027 /// vector loads/masked gathers instead of regular gathers. Later these loads
3028 /// are reshufled to build final gathered nodes.
3029 void tryToVectorizeGatheredLoads(
3030 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads);
3031
3032 /// Reorder commutative or alt operands to get better probability of
3033 /// generating vectorized code.
3034 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3037 const BoUpSLP &R);
3038
3039 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3040 /// users of \p TE and collects the stores. It returns the map from the store
3041 /// pointers to the collected stores.
3043 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3044
3045 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3046 /// stores in \p StoresVec can form a vector instruction. If so it returns
3047 /// true and populates \p ReorderIndices with the shuffle indices of the
3048 /// stores when compared to the sorted vector.
3049 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3050 OrdersType &ReorderIndices) const;
3051
3052 /// Iterates through the users of \p TE, looking for scalar stores that can be
3053 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3054 /// their order and builds an order index vector for each store bundle. It
3055 /// returns all these order vectors found.
3056 /// We run this after the tree has formed, otherwise we may come across user
3057 /// instructions that are not yet in the tree.
3059 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3060
3061 struct TreeEntry {
3062 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3063 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3064
3065 /// \returns Common mask for reorder indices and reused scalars.
3066 SmallVector<int> getCommonMask() const {
3068 inversePermutation(ReorderIndices, Mask);
3069 ::addMask(Mask, ReuseShuffleIndices);
3070 return Mask;
3071 }
3072
3073 /// \returns true if the scalars in VL are equal to this entry.
3074 bool isSame(ArrayRef<Value *> VL) const {
3075 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3076 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3077 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3078 return VL.size() == Mask.size() &&
3079 std::equal(VL.begin(), VL.end(), Mask.begin(),
3080 [Scalars](Value *V, int Idx) {
3081 return (isa<UndefValue>(V) &&
3082 Idx == PoisonMaskElem) ||
3083 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3084 });
3085 };
3086 if (!ReorderIndices.empty()) {
3087 // TODO: implement matching if the nodes are just reordered, still can
3088 // treat the vector as the same if the list of scalars matches VL
3089 // directly, without reordering.
3091 inversePermutation(ReorderIndices, Mask);
3092 if (VL.size() == Scalars.size())
3093 return IsSame(Scalars, Mask);
3094 if (VL.size() == ReuseShuffleIndices.size()) {
3095 ::addMask(Mask, ReuseShuffleIndices);
3096 return IsSame(Scalars, Mask);
3097 }
3098 return false;
3099 }
3100 return IsSame(Scalars, ReuseShuffleIndices);
3101 }
3102
3103 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3104 return isGather() && !UserTreeIndices.empty() &&
3105 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3106 UserTreeIndices.front().UserTE == UserEI.UserTE;
3107 }
3108
3109 /// \returns true if current entry has same operands as \p TE.
3110 bool hasEqualOperands(const TreeEntry &TE) const {
3111 if (TE.getNumOperands() != getNumOperands())
3112 return false;
3113 SmallBitVector Used(getNumOperands());
3114 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3115 unsigned PrevCount = Used.count();
3116 for (unsigned K = 0; K < E; ++K) {
3117 if (Used.test(K))
3118 continue;
3119 if (getOperand(K) == TE.getOperand(I)) {
3120 Used.set(K);
3121 break;
3122 }
3123 }
3124 // Check if we actually found the matching operand.
3125 if (PrevCount == Used.count())
3126 return false;
3127 }
3128 return true;
3129 }
3130
3131 /// \return Final vectorization factor for the node. Defined by the total
3132 /// number of vectorized scalars, including those, used several times in the
3133 /// entry and counted in the \a ReuseShuffleIndices, if any.
3134 unsigned getVectorFactor() const {
3135 if (!ReuseShuffleIndices.empty())
3136 return ReuseShuffleIndices.size();
3137 return Scalars.size();
3138 };
3139
3140 /// Checks if the current node is a gather node.
3141 bool isGather() const {return State == NeedToGather; }
3142
3143 /// A vector of scalars.
3144 ValueList Scalars;
3145
3146 /// The Scalars are vectorized into this value. It is initialized to Null.
3147 WeakTrackingVH VectorizedValue = nullptr;
3148
3149 /// New vector phi instructions emitted for the vectorized phi nodes.
3150 PHINode *PHI = nullptr;
3151
3152 /// Do we need to gather this sequence or vectorize it
3153 /// (either with vector instruction or with scatter/gather
3154 /// intrinsics for store/load)?
3155 enum EntryState {
3156 Vectorize, ///< The node is regularly vectorized.
3157 ScatterVectorize, ///< Masked scatter/gather node.
3158 StridedVectorize, ///< Strided loads (and stores)
3159 NeedToGather, ///< Gather/buildvector node.
3160 CombinedVectorize, ///< Vectorized node, combined with its user into more
3161 ///< complex node like select/cmp to minmax, mul/add to
3162 ///< fma, etc. Must be used for the following nodes in
3163 ///< the pattern, not the very first one.
3164 };
3165 EntryState State;
3166
3167 /// List of combined opcodes supported by the vectorizer.
3168 enum CombinedOpcode {
3169 NotCombinedOp = -1,
3170 MinMax = Instruction::OtherOpsEnd + 1,
3171 };
3172 CombinedOpcode CombinedOp = NotCombinedOp;
3173
3174 /// Does this sequence require some shuffling?
3175 SmallVector<int, 4> ReuseShuffleIndices;
3176
3177 /// Does this entry require reordering?
3178 SmallVector<unsigned, 4> ReorderIndices;
3179
3180 /// Points back to the VectorizableTree.
3181 ///
3182 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3183 /// to be a pointer and needs to be able to initialize the child iterator.
3184 /// Thus we need a reference back to the container to translate the indices
3185 /// to entries.
3186 VecTreeTy &Container;
3187
3188 /// The TreeEntry index containing the user of this entry. We can actually
3189 /// have multiple users so the data structure is not truly a tree.
3190 SmallVector<EdgeInfo, 1> UserTreeIndices;
3191
3192 /// The index of this treeEntry in VectorizableTree.
3193 int Idx = -1;
3194
3195 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3196 /// other nodes as a series of insertvector instructions.
3197 SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
3198
3199 private:
3200 /// The operands of each instruction in each lane Operands[op_index][lane].
3201 /// Note: This helps avoid the replication of the code that performs the
3202 /// reordering of operands during buildTree_rec() and vectorizeTree().
3204
3205 /// The main/alternate instruction.
3206 Instruction *MainOp = nullptr;
3207 Instruction *AltOp = nullptr;
3208
3209 public:
3210 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3211 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3212 if (Operands.size() < OpIdx + 1)
3213 Operands.resize(OpIdx + 1);
3214 assert(Operands[OpIdx].empty() && "Already resized?");
3215 assert(OpVL.size() <= Scalars.size() &&
3216 "Number of operands is greater than the number of scalars.");
3217 Operands[OpIdx].resize(OpVL.size());
3218 copy(OpVL, Operands[OpIdx].begin());
3219 }
3220
3221 /// Set the operands of this bundle in their original order.
3222 void setOperandsInOrder() {
3223 assert(Operands.empty() && "Already initialized?");
3224 auto *I0 = cast<Instruction>(Scalars[0]);
3225 Operands.resize(I0->getNumOperands());
3226 unsigned NumLanes = Scalars.size();
3227 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3228 OpIdx != NumOperands; ++OpIdx) {
3229 Operands[OpIdx].resize(NumLanes);
3230 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3231 auto *I = cast<Instruction>(Scalars[Lane]);
3232 assert(I->getNumOperands() == NumOperands &&
3233 "Expected same number of operands");
3234 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3235 }
3236 }
3237 }
3238
3239 /// Reorders operands of the node to the given mask \p Mask.
3240 void reorderOperands(ArrayRef<int> Mask) {
3241 for (ValueList &Operand : Operands)
3242 reorderScalars(Operand, Mask);
3243 }
3244
3245 /// \returns the \p OpIdx operand of this TreeEntry.
3246 ValueList &getOperand(unsigned OpIdx) {
3247 assert(OpIdx < Operands.size() && "Off bounds");
3248 return Operands[OpIdx];
3249 }
3250
3251 /// \returns the \p OpIdx operand of this TreeEntry.
3252 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3253 assert(OpIdx < Operands.size() && "Off bounds");
3254 return Operands[OpIdx];
3255 }
3256
3257 /// \returns the number of operands.
3258 unsigned getNumOperands() const { return Operands.size(); }
3259
3260 /// \return the single \p OpIdx operand.
3261 Value *getSingleOperand(unsigned OpIdx) const {
3262 assert(OpIdx < Operands.size() && "Off bounds");
3263 assert(!Operands[OpIdx].empty() && "No operand available");
3264 return Operands[OpIdx][0];
3265 }
3266
3267 /// Some of the instructions in the list have alternate opcodes.
3268 bool isAltShuffle() const { return MainOp != AltOp; }
3269
3270 bool isOpcodeOrAlt(Instruction *I) const {
3271 unsigned CheckedOpcode = I->getOpcode();
3272 return (getOpcode() == CheckedOpcode ||
3273 getAltOpcode() == CheckedOpcode);
3274 }
3275
3276 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3277 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
3278 /// \p OpValue.
3279 Value *isOneOf(Value *Op) const {
3280 auto *I = dyn_cast<Instruction>(Op);
3281 if (I && isOpcodeOrAlt(I))
3282 return Op;
3283 return MainOp;
3284 }
3285
3286 void setOperations(const InstructionsState &S) {
3287 MainOp = S.MainOp;
3288 AltOp = S.AltOp;
3289 }
3290
3291 Instruction *getMainOp() const {
3292 return MainOp;
3293 }
3294
3295 Instruction *getAltOp() const {
3296 return AltOp;
3297 }
3298
3299 /// The main/alternate opcodes for the list of instructions.
3300 unsigned getOpcode() const {
3301 return MainOp ? MainOp->getOpcode() : 0;
3302 }
3303
3304 unsigned getAltOpcode() const {
3305 return AltOp ? AltOp->getOpcode() : 0;
3306 }
3307
3308 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
3309 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
3310 int findLaneForValue(Value *V) const {
3311 unsigned FoundLane = getVectorFactor();
3312 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3313 std::advance(It, 1)) {
3314 if (*It != V)
3315 continue;
3316 FoundLane = std::distance(Scalars.begin(), It);
3317 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3318 if (!ReorderIndices.empty())
3319 FoundLane = ReorderIndices[FoundLane];
3320 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3321 if (ReuseShuffleIndices.empty())
3322 break;
3323 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3324 RIt != ReuseShuffleIndices.end()) {
3325 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3326 break;
3327 }
3328 }
3329 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3330 return FoundLane;
3331 }
3332
3333 /// Build a shuffle mask for graph entry which represents a merge of main
3334 /// and alternate operations.
3335 void
3336 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3338 SmallVectorImpl<Value *> *OpScalars = nullptr,
3339 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3340
3341 /// Return true if this is a non-power-of-2 node.
3342 bool isNonPowOf2Vec() const {
3343 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3344 return IsNonPowerOf2;
3345 }
3346
3347 /// Return true if this is a node, which tries to vectorize number of
3348 /// elements, forming whole vectors.
3349 bool
3350 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3351 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3352 TTI, getValueType(Scalars.front()), Scalars.size());
3353 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3354 "Reshuffling not supported with non-power-of-2 vectors yet.");
3355 return IsNonPowerOf2;
3356 }
3357
3358#ifndef NDEBUG
3359 /// Debug printer.
/// Prints the entry index, per-operand scalars, the scalars themselves, the
/// entry state, main/alt operations, vectorized value and the
/// reuse/reorder/user metadata to the debug stream.
3360 LLVM_DUMP_METHOD void dump() const {
3361 dbgs() << Idx << ".\n";
3362 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3363 dbgs() << "Operand " << OpI << ":\n";
3364 for (const Value *V : Operands[OpI])
3365 dbgs().indent(2) << *V << "\n";
3366 }
3367 dbgs() << "Scalars: \n";
3368 for (Value *V : Scalars)
3369 dbgs().indent(2) << *V << "\n";
3370 dbgs() << "State: ";
3371 switch (State) {
3372 case Vectorize:
3373 dbgs() << "Vectorize\n";
3374 break;
3375 case ScatterVectorize:
3376 dbgs() << "ScatterVectorize\n";
3377 break;
3378 case StridedVectorize:
3379 dbgs() << "StridedVectorize\n";
3380 break;
3381 case NeedToGather:
3382 dbgs() << "NeedToGather\n";
3383 break;
3384 case CombinedVectorize:
3385 dbgs() << "CombinedVectorize\n";
3386 break;
3387 }
3388 dbgs() << "MainOp: ";
3389 if (MainOp)
3390 dbgs() << *MainOp << "\n";
3391 else
3392 dbgs() << "NULL\n";
3393 dbgs() << "AltOp: ";
3394 if (AltOp)
3395 dbgs() << *AltOp << "\n";
3396 else
3397 dbgs() << "NULL\n";
3398 dbgs() << "VectorizedValue: ";
3399 if (VectorizedValue)
3400 dbgs() << *VectorizedValue << "\n";
3401 else
3402 dbgs() << "NULL\n";
3403 dbgs() << "ReuseShuffleIndices: ";
3404 if (ReuseShuffleIndices.empty())
3405 dbgs() << "Empty";
3406 else
3407 for (int ReuseIdx : ReuseShuffleIndices)
3408 dbgs() << ReuseIdx << ", ";
3409 dbgs() << "\n";
3410 dbgs() << "ReorderIndices: ";
3411 for (unsigned ReorderIdx : ReorderIndices)
3412 dbgs() << ReorderIdx << ", ";
3413 dbgs() << "\n";
3414 dbgs() << "UserTreeIndices: ";
3415 for (const auto &EInfo : UserTreeIndices)
3416 dbgs() << EInfo << ", ";
3417 dbgs() << "\n";
3418 }
3419#endif
3420 };
3421
3422#ifndef NDEBUG
3423 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3424 InstructionCost VecCost, InstructionCost ScalarCost,
3425 StringRef Banner) const {
3426 dbgs() << "SLP: " << Banner << ":\n";
3427 E->dump();
3428 dbgs() << "SLP: Costs:\n";
3429 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3430 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3431 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3432 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3433 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3434 }
3435#endif
3436
3437 /// Create a new VectorizableTree entry.
3438 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3439 std::optional<ScheduleData *> Bundle,
3440 const InstructionsState &S,
3441 const EdgeInfo &UserTreeIdx,
3442 ArrayRef<int> ReuseShuffleIndices = {},
3443 ArrayRef<unsigned> ReorderIndices = {}) {
3444 TreeEntry::EntryState EntryState =
3445 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3446 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3447 ReuseShuffleIndices, ReorderIndices);
3448 }
3449
/// Create a new VectorizableTree entry with an explicitly chosen
/// \p EntryState, register its scalars in the scalar-to-entry maps (or in
/// the gather maps for gather nodes), and attach it to the user edge.
/// \returns the new entry, or nullptr when an already existing
/// gathered-loads entry should be reused instead.
3450 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3451 TreeEntry::EntryState EntryState,
3452 std::optional<ScheduleData *> Bundle,
3453 const InstructionsState &S,
3454 const EdgeInfo &UserTreeIdx,
3455 ArrayRef<int> ReuseShuffleIndices = {},
3456 ArrayRef<unsigned> ReorderIndices = {}) {
3457 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3458 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3459 "Need to vectorize gather entry?");
3460 // Gathered loads still gathered? Do not create entry, use the original one.
3461 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
3462 EntryState == TreeEntry::NeedToGather &&
3463 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3464 !UserTreeIdx.UserTE)
3465 return nullptr;
3466 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3467 TreeEntry *Last = VectorizableTree.back().get();
3468 Last->Idx = VectorizableTree.size() - 1;
3469 Last->State = EntryState;
3470 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3471 // for non-power-of-two vectors.
3472 assert(
// NOTE(review): the first operand line of this assert (doc line 3473) was
// lost in extraction — presumably a hasFullVectorsOrPowerOf2 check on VL;
// confirm against upstream SLPVectorizer.cpp.
3474 ReuseShuffleIndices.empty()) &&
3475 "Reshuffling scalars not yet supported for nodes with padding");
3476 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3477 ReuseShuffleIndices.end());
3478 if (ReorderIndices.empty()) {
3479 Last->Scalars.assign(VL.begin(), VL.end());
3480 Last->setOperations(S);
3481 } else {
3482 // Reorder scalars and build final mask.
3483 Last->Scalars.assign(VL.size(), nullptr);
3484 transform(ReorderIndices, Last->Scalars.begin(),
3485 [VL](unsigned Idx) -> Value * {
// Out-of-range reorder indices become undef lanes (padding).
3486 if (Idx >= VL.size())
3487 return UndefValue::get(VL.front()->getType());
3488 return VL[Idx];
3489 });
// Opcode state must be recomputed for the reordered scalar sequence.
3490 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3491 Last->setOperations(S);
3492 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3493 }
3494 if (!Last->isGather()) {
3495 for (Value *V : VL) {
3496 const TreeEntry *TE = getTreeEntry(V);
3497 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3498 "Scalar already in tree!");
3499 if (TE) {
// Scalar already belongs to another node: record multi-node usage.
3500 if (TE != Last)
3501 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3502 continue;
3503 }
3504 ScalarToTreeEntry[V] = Last;
3505 }
3506 // Update the scheduler bundle to point to this TreeEntry.
3507 ScheduleData *BundleMember = *Bundle;
3508 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3509 isVectorLikeInstWithConstOps(S.MainOp) ||
3510 doesNotNeedToSchedule(VL)) &&
3511 "Bundle and VL out of sync");
3512 if (BundleMember) {
3513 for (Value *V : VL) {
// NOTE(review): the loop guard on doc line 3514 was lost in extraction —
// presumably `if (doesNotNeedToBeScheduled(V))`; confirm against upstream.
3515 continue;
3516 if (!BundleMember)
3517 continue;
3518 BundleMember->TE = Last;
3519 BundleMember = BundleMember->NextInBundle;
3520 }
3521 }
3522 assert(!BundleMember && "Bundle and VL out of sync");
3523 } else {
3524 // Build a map for gathered scalars to the nodes where they are used.
3525 bool AllConstsOrCasts = true;
3526 for (Value *V : VL)
3527 if (!isConstant(V)) {
3528 auto *I = dyn_cast<CastInst>(V);
3529 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3530 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3531 !UserTreeIdx.UserTE->isGather())
3532 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3533 }
3534 if (AllConstsOrCasts)
3535 CastMaxMinBWSizes =
3536 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3537 MustGather.insert(VL.begin(), VL.end());
3538 }
3539
3540 if (UserTreeIdx.UserTE)
3541 Last->UserTreeIndices.push_back(UserTreeIdx);
3542 return Last;
3543 }
3544
3545 /// -- Vectorization State --
3546 /// Holds all of the tree entries.
/// Entries are created and owned exclusively by newTreeEntry();
/// TreeEntry::Idx is the position of an entry within this container.
3547 TreeEntry::VecTreeTy VectorizableTree;
3548
3549#ifndef NDEBUG
3550 /// Debug printer.
3551 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3552 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3553 VectorizableTree[Id]->dump();
3554 dbgs() << "\n";
3555 }
3556 }
3557#endif
3558
/// Returns the tree entry the scalar \p V was vectorized into, or nullptr
/// if \p V is not part of any vectorized node (lookup default).
3559 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3560
/// Const overload of the above.
3561 const TreeEntry *getTreeEntry(Value *V) const {
3562 return ScalarToTreeEntry.lookup(V);
3563 }
3564
3565 /// Check that the operand node of alternate node does not generate
3566 /// buildvector sequence. If it is, then probably not worth it to build
3567 /// alternate shuffle, if number of buildvector operands + alternate
3568 /// instruction > than the number of buildvector instructions.
3569 /// \param S the instructions state of the analyzed values.
3570 /// \param VL list of the instructions with alternate opcodes.
/// \returns true if building the alternate shuffle is considered profitable.
3571 bool areAltOperandsProfitable(const InstructionsState &S,
3572 ArrayRef<Value *> VL) const;
3573
3574 /// Checks if the specified list of the instructions/values can be vectorized
3575 /// and fills required data before actual scheduling of the instructions.
/// \returns the entry state chosen for the scalars; \p CurrentOrder and
/// \p PointerOps are output parameters filled during the analysis.
3576 TreeEntry::EntryState getScalarsVectorizationState(
3577 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3578 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps);
3579
3580 /// Maps a specific scalar to its tree entry.
3581 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3582
3583 /// List of scalars, used in several vectorize nodes, and the list of the
3584 /// nodes.
// NOTE(review): the MultiNodeScalars declaration (doc line 3585) was lost in
// extraction — presumably a SmallDenseMap from Value* to a vector of
// TreeEntry*; confirm against upstream.
3586
3587 /// Maps a value to the proposed vectorizable size.
3588 SmallDenseMap<Value *, unsigned> InstrElementSize;
3589
3590 /// A list of scalars that we found that we need to keep as scalars.
3591 ValueSet MustGather;
3592
3593 /// A set of first non-schedulable values.
3594 ValueSet NonScheduledFirst;
3595
3596 /// A map between the vectorized entries and the last instructions in the
3597 /// bundles. The bundles are built in use order, not in the def order of the
3598 /// instructions. So, we cannot rely directly on the last instruction in the
3599 /// bundle being the last instruction in the program order during
3600 /// vectorization process since the basic blocks are affected, need to
3601 /// pre-gather them before.
3602 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3603
3604 /// List of gather nodes, depending on other gather/vector nodes, which should
3605 /// be emitted after the vector instruction emission process to correctly
3606 /// handle order of the vector instructions and shuffles.
3607 SetVector<const TreeEntry *> PostponedGathers;
3608
3609 using ValueToGatherNodesMap =
// NOTE(review): the mapped type of this alias (doc line 3610) was lost in
// extraction — presumably DenseMap<Value *, SmallPtrSet<const TreeEntry *,
// N>>; confirm against upstream.
3611 ValueToGatherNodesMap ValueToGatherNodes;
3612
3613 /// The index of the first gathered load entry in the VectorizeTree.
/// NoGatheredLoads (-1) means no gathered-loads entries were created.
3614 constexpr static int NoGatheredLoads = -1;
3615 int GatheredLoadsEntriesFirst = NoGatheredLoads;
3616
3617 /// This POD struct describes one external user in the vectorized tree.
3618 struct ExternalUser {
3619 ExternalUser(Value *S, llvm::User *U, int L)
3620 : Scalar(S), User(U), Lane(L) {}
3621
3622 // Which scalar in our function.
3623 Value *Scalar;
3624
3625 // Which user that uses the scalar.
// NOTE(review): the member declaration (doc line 3626) was lost in
// extraction — presumably `llvm::User *User;` given the constructor above;
// confirm against upstream.
3627
3628 // Which lane does the scalar belong to.
3629 int Lane;
3630 };
/// A list of external users of the vectorized scalars.
3631 using UserList = SmallVector<ExternalUser, 16>;
3632
3633 /// Checks if two instructions may access the same memory.
3634 ///
3635 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3636 /// is invariant in the calling loop.
3637 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3638 Instruction *Inst2) {
3639 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3640 return true;
3641 // First check if the result is already in the cache.
3642 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3643 auto It = AliasCache.find(Key);
3644 if (It != AliasCache.end())
3645 return It->second;
3646 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3647 // Store the result in the cache.
3648 AliasCache.try_emplace(Key, Aliased);
3649 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3650 return Aliased;
3651 }
3652
/// Ordered pair of instructions used as a key into the alias cache.
3653 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3654
3655 /// Cache for alias results.
3656 /// TODO: consider moving this to the AliasAnalysis itself.
// NOTE(review): the AliasCache declaration (doc line 3657) was lost in
// extraction — presumably a DenseMap keyed by AliasCacheKey, consistent with
// isAliased() above; confirm against upstream.
3658
3659 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3660 // globally through SLP because we don't perform any action which
3661 // invalidates capture results.
3662 BatchAAResults BatchAA;
3663
3664 /// Temporary store for deleted instructions. Instructions will be deleted
3665 /// eventually when the BoUpSLP is destructed. The deferral is required to
3666 /// ensure that there are no incorrect collisions in the AliasCache, which
3667 /// can happen if a new instruction is allocated at the same address as a
3668 /// previously deleted instruction.
3669 DenseSet<Instruction *> DeletedInstructions;
3670
3671 /// Set of the instruction, being analyzed already for reductions.
3672 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3673
3674 /// Set of hashes for the list of reduction values already being analyzed.
3675 DenseSet<size_t> AnalyzedReductionVals;
3676
3677 /// Values, already been analyzed for minimal bitwidth and found to be
3678 /// non-profitable.
3679 DenseSet<Value *> AnalyzedMinBWVals;
3680
3681 /// A list of values that need to be extracted out of the tree.
3682 /// This list holds pairs of (Internal Scalar : External User). External User
3683 /// can be nullptr, it means that this Internal Scalar will be used later,
3684 /// after vectorization.
3685 UserList ExternalUses;
3686
3687 /// A list of GEPs which can be replaced by scalar GEPs instead of
3688 /// extractelement instructions.
3689 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3690
3691 /// Values used only by @llvm.assume calls.
// NOTE(review): the EphValues declaration (doc line 3692) was lost in
// extraction — presumably a SmallPtrSet of const Value*; confirm against
// upstream.
3693
3694 /// Holds all of the instructions that we gathered, shuffle instructions and
3695 /// extractelements.
3696 SetVector<Instruction *> GatherShuffleExtractSeq;
3697
3698 /// A list of blocks that we are going to CSE.
3699 DenseSet<BasicBlock *> CSEBlocks;
3700
3701 /// List of hashes of vector of loads, which are known to be non vectorizable.
3702 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3703
3704 /// Contains all scheduling relevant data for an instruction.
3705 /// A ScheduleData either represents a single instruction or a member of an
3706 /// instruction bundle (= a group of instructions which is combined into a
3707 /// vector instruction).
3708 struct ScheduleData {
3709 // The initial value for the dependency counters. It means that the
3710 // dependencies are not calculated yet.
3711 enum { InvalidDeps = -1 };
3712
3713 ScheduleData() = default;
3714
/// (Re)initializes this node for a fresh scheduling region: resets the
/// bundle links, the scheduled flag and all dependency state.
3715 void init(int BlockSchedulingRegionID, Instruction *I) {
3716 FirstInBundle = this;
3717 NextInBundle = nullptr;
3718 NextLoadStore = nullptr;
3719 IsScheduled = false;
3720 SchedulingRegionID = BlockSchedulingRegionID;
3721 clearDependencies();
3722 Inst = I;
3723 TE = nullptr;
3724 }
3725
3726 /// Verify basic self consistency properties.
3727 void verify() {
3728 if (hasValidDependencies()) {
3729 assert(UnscheduledDeps <= Dependencies && "invariant");
3730 } else {
3731 assert(UnscheduledDeps == Dependencies && "invariant");
3732 }
3733
3734 if (IsScheduled) {
3735 assert(isSchedulingEntity() &&
3736 "unexpected scheduled state");
3737 for (const ScheduleData *BundleMember = this; BundleMember;
3738 BundleMember = BundleMember->NextInBundle) {
3739 assert(BundleMember->hasValidDependencies() &&
3740 BundleMember->UnscheduledDeps == 0 &&
3741 "unexpected scheduled state");
3742 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3743 "only bundle is marked scheduled");
3744 }
3745 }
3746
3747 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3748 "all bundle members must be in same basic block");
3749 }
3750
3751 /// Returns true if the dependency information has been calculated.
3752 /// Note that dependency validity can vary between instructions within
3753 /// a single bundle.
3754 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3755
3756 /// Returns true for single instructions and for bundle representatives
3757 /// (= the head of a bundle).
3758 bool isSchedulingEntity() const { return FirstInBundle == this; }
3759
3760 /// Returns true if it represents an instruction bundle and not only a
3761 /// single instruction.
3762 bool isPartOfBundle() const {
3763 return NextInBundle != nullptr || FirstInBundle != this || TE;
3764 }
3765
3766 /// Returns true if it is ready for scheduling, i.e. it has no more
3767 /// unscheduled depending instructions/bundles.
3768 bool isReady() const {
3769 assert(isSchedulingEntity() &&
3770 "can't consider non-scheduling entity for ready list");
3771 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3772 }
3773
3774 /// Modifies the number of unscheduled dependencies for this instruction,
3775 /// and returns the number of remaining dependencies for the containing
3776 /// bundle.
3777 int incrementUnscheduledDeps(int Incr) {
3778 assert(hasValidDependencies() &&
3779 "increment of unscheduled deps would be meaningless");
3780 UnscheduledDeps += Incr;
3781 return FirstInBundle->unscheduledDepsInBundle();
3782 }
3783
3784 /// Sets the number of unscheduled dependencies to the number of
3785 /// dependencies.
3786 void resetUnscheduledDeps() {
3787 UnscheduledDeps = Dependencies;
3788 }
3789
3790 /// Clears all dependency information.
3791 void clearDependencies() {
3792 Dependencies = InvalidDeps;
3793 resetUnscheduledDeps();
3794 MemoryDependencies.clear();
3795 ControlDependencies.clear();
3796 }
3797
/// Sums the unscheduled dependencies over the whole bundle; returns
/// InvalidDeps as soon as any member's dependencies are not calculated.
3798 int unscheduledDepsInBundle() const {
3799 assert(isSchedulingEntity() && "only meaningful on the bundle");
3800 int Sum = 0;
3801 for (const ScheduleData *BundleMember = this; BundleMember;
3802 BundleMember = BundleMember->NextInBundle) {
3803 if (BundleMember->UnscheduledDeps == InvalidDeps)
3804 return InvalidDeps;
3805 Sum += BundleMember->UnscheduledDeps;
3806 }
3807 return Sum;
3808 }
3809
/// Prints this node to \p os: "/ inst" for a non-head bundle member,
/// "[inst;inst;...]" for a bundle head, or just the instruction otherwise.
3810 void dump(raw_ostream &os) const {
3811 if (!isSchedulingEntity()) {
3812 os << "/ " << *Inst;
3813 } else if (NextInBundle) {
3814 os << '[' << *Inst;
3815 ScheduleData *SD = NextInBundle;
3816 while (SD) {
3817 os << ';' << *SD->Inst;
3818 SD = SD->NextInBundle;
3819 }
3820 os << ']';
3821 } else {
3822 os << *Inst;
3823 }
3824 }
3825
/// The instruction this ScheduleData node represents.
3826 Instruction *Inst = nullptr;
3827
3828 /// The TreeEntry that this instruction corresponds to.
3829 TreeEntry *TE = nullptr;
3830
3831 /// Points to the head in an instruction bundle (and always to this for
3832 /// single instructions).
3833 ScheduleData *FirstInBundle = nullptr;
3834
3835 /// Single linked list of all instructions in a bundle. Null if it is a
3836 /// single instruction.
3837 ScheduleData *NextInBundle = nullptr;
3838
3839 /// Single linked list of all memory instructions (e.g. load, store, call)
3840 /// in the block - until the end of the scheduling region.
3841 ScheduleData *NextLoadStore = nullptr;
3842
3843 /// The dependent memory instructions.
3844 /// This list is derived on demand in calculateDependencies().
3845 SmallVector<ScheduleData *, 4> MemoryDependencies;
3846
3847 /// List of instructions which this instruction could be control dependent
3848 /// on. Allowing such nodes to be scheduled below this one could introduce
3849 /// a runtime fault which didn't exist in the original program.
3850 /// ex: this is a load or udiv following a readonly call which inf loops
3851 SmallVector<ScheduleData *, 4> ControlDependencies;
3852
3853 /// This ScheduleData is in the current scheduling region if this matches
3854 /// the current SchedulingRegionID of BlockScheduling.
3855 int SchedulingRegionID = 0;
3856
3857 /// Used for getting a "good" final ordering of instructions.
3858 int SchedulingPriority = 0;
3859
3860 /// The number of dependencies. Constitutes of the number of users of the
3861 /// instruction plus the number of dependent memory instructions (if any).
3862 /// This value is calculated on demand.
3863 /// If InvalidDeps, the number of dependencies is not calculated yet.
3864 int Dependencies = InvalidDeps;
3865
3866 /// The number of dependencies minus the number of dependencies of scheduled
3867 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3868 /// for scheduling.
3869 /// Note that this is negative as long as Dependencies is not calculated.
3870 int UnscheduledDeps = InvalidDeps;
3871
3872 /// True if this instruction is scheduled (or considered as scheduled in the
3873 /// dry-run).
3874 bool IsScheduled = false;
3875 };
3876
3877#ifndef NDEBUG
/// Stream insertion for ScheduleData, forwarding to ScheduleData::dump().
// NOTE(review): the operator signature line (doc line 3878) was lost in
// extraction — presumably `friend inline raw_ostream &operator<<(raw_ostream
// &os,`; confirm against upstream.
3879 const BoUpSLP::ScheduleData &SD) {
3880 SD.dump(os);
3881 return os;
3882 }
3883#endif
3884
3885 friend struct GraphTraits<BoUpSLP *>;
3886 friend struct DOTGraphTraits<BoUpSLP *>;
3887
3888 /// Contains all scheduling data for a basic block.
3889 /// It does not schedule instructions, which are not memory read/write
3890 /// instructions and their operands are either constants, or arguments, or
3891 /// phis, or instructions from other blocks, or their users are phis or from
3892 /// the other blocks. The resulting vector instructions can be placed at the
3893 /// beginning of the basic block without scheduling (if operands do not need
3894 /// to be scheduled) or at the end of the block (if users are outside of the
3895 /// block). It allows saving some compile time and memory used by the
3896 /// compiler.
3897 /// ScheduleData is assigned for each instruction in between the boundaries of
3898 /// the tree entry, even for those, which are not part of the graph. It is
3899 /// required to correctly follow the dependencies between the instructions and
3900 /// their correct scheduling. The ScheduleData is not allocated for the
3901 /// instructions, which do not require scheduling, like phis, nodes with
3902 /// extractelements/insertelements only or nodes with instructions, with
3903 /// uses/operands outside of the block.
3904 struct BlockScheduling {
3905 BlockScheduling(BasicBlock *BB)
3906 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3907
/// Resets the per-region state and opens a fresh scheduling region by
/// bumping SchedulingRegionID; shrinks the region budget by the size of
/// the previous run.
3908 void clear() {
3909 ReadyInsts.clear();
3910 ScheduleStart = nullptr;
3911 ScheduleEnd = nullptr;
3912 FirstLoadStoreInRegion = nullptr;
3913 LastLoadStoreInRegion = nullptr;
3914 RegionHasStackSave = false;
3915
3916 // Reduce the maximum schedule region size by the size of the
3917 // previous scheduling run.
3918 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3919 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3920 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3921 ScheduleRegionSize = 0;
3922
3923 // Make a new scheduling region, i.e. all existing ScheduleData is not
3924 // in the new region yet.
3925 ++SchedulingRegionID;
3926 }
3927
/// Returns the ScheduleData for \p I if it belongs to this block's current
/// scheduling region, otherwise nullptr.
3928 ScheduleData *getScheduleData(Instruction *I) {
3929 if (BB != I->getParent())
3930 // Avoid lookup if can't possibly be in map.
3931 return nullptr;
3932 ScheduleData *SD = ScheduleDataMap.lookup(I);
3933 if (SD && isInSchedulingRegion(SD))
3934 return SD;
3935 return nullptr;
3936 }
3937
3938 ScheduleData *getScheduleData(Value *V) {
3939 if (auto *I = dyn_cast<Instruction>(V))
3940 return getScheduleData(I);
3941 return nullptr;
3942 }
3943
3944 bool isInSchedulingRegion(ScheduleData *SD) const {
3945 return SD->SchedulingRegionID == SchedulingRegionID;
3946 }
3947
3948 /// Marks an instruction as scheduled and puts all dependent ready
3949 /// instructions into the ready-list.
3950 template <typename ReadyListType>
3951 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3952 SD->IsScheduled = true;
3953 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3954
3955 for (ScheduleData *BundleMember = SD; BundleMember;
3956 BundleMember = BundleMember->NextInBundle) {
3957
3958 // Handle the def-use chain dependencies.
3959
3960 // Decrement the unscheduled counter and insert to ready list if ready.
3961 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3962 ScheduleData *OpDef = getScheduleData(I);
3963 if (OpDef && OpDef->hasValidDependencies() &&
3964 OpDef->incrementUnscheduledDeps(-1) == 0) {
3965 // There are no more unscheduled dependencies after
3966 // decrementing, so we can put the dependent instruction
3967 // into the ready list.
3968 ScheduleData *DepBundle = OpDef->FirstInBundle;
3969 assert(!DepBundle->IsScheduled &&
3970 "already scheduled bundle gets ready");
3971 ReadyList.insert(DepBundle);
// NOTE(review): the `LLVM_DEBUG(dbgs()` opener of this debug statement (doc
// line 3972) was lost in extraction; confirm against upstream.
3973 << "SLP: gets ready (def): " << *DepBundle << "\n");
3974 }
3975 };
3976
3977 // If BundleMember is a vector bundle, its operands may have been
3978 // reordered during buildTree(). We therefore need to get its operands
3979 // through the TreeEntry.
3980 if (TreeEntry *TE = BundleMember->TE) {
3981 // Need to search for the lane since the tree entry can be reordered.
3982 int Lane = std::distance(TE->Scalars.begin(),
3983 find(TE->Scalars, BundleMember->Inst));
3984 assert(Lane >= 0 && "Lane not set");
3985
3986 // Since vectorization tree is being built recursively this assertion
3987 // ensures that the tree entry has all operands set before reaching
3988 // this code. Couple of exceptions known at the moment are extracts
3989 // where their second (immediate) operand is not added. Since
3990 // immediates do not affect scheduler behavior this is considered
3991 // okay.
3992 auto *In = BundleMember->Inst;
3993 assert(
3994 In &&
// NOTE(review): one operand line of this assert (doc line 3995) was lost in
// extraction — presumably an isa<> check for extract/intrinsic exceptions;
// confirm against upstream.
3996 In->getNumOperands() == TE->getNumOperands()) &&
3997 "Missed TreeEntry operands?");
3998 (void)In; // fake use to avoid build failure when assertions disabled
3999
4000 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4001 OpIdx != NumOperands; ++OpIdx)
4002 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4003 DecrUnsched(I);
4004 } else {
4005 // If BundleMember is a stand-alone instruction, no operand reordering
4006 // has taken place, so we directly access its operands.
4007 for (Use &U : BundleMember->Inst->operands())
4008 if (auto *I = dyn_cast<Instruction>(U.get()))
4009 DecrUnsched(I);
4010 }
4011 // Handle the memory dependencies.
4012 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4013 if (MemoryDepSD->hasValidDependencies() &&
4014 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4015 // There are no more unscheduled dependencies after decrementing,
4016 // so we can put the dependent instruction into the ready list.
4017 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4018 assert(!DepBundle->IsScheduled &&
4019 "already scheduled bundle gets ready");
4020 ReadyList.insert(DepBundle);
// NOTE(review): the `LLVM_DEBUG(dbgs()` opener of this debug statement (doc
// line 4021) was lost in extraction; confirm against upstream.
4022 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4023 }
4024 }
4025 // Handle the control dependencies.
4026 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4027 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4028 // There are no more unscheduled dependencies after decrementing,
4029 // so we can put the dependent instruction into the ready list.
4030 ScheduleData *DepBundle = DepSD->FirstInBundle;
4031 assert(!DepBundle->IsScheduled &&
4032 "already scheduled bundle gets ready");
4033 ReadyList.insert(DepBundle);
// NOTE(review): the `LLVM_DEBUG(dbgs()` opener of this debug statement (doc
// line 4034) was lost in extraction; confirm against upstream.
4035 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4036 }
4037 }
4038 }
4039 }
4040
4041 /// Verify basic self consistency properties of the data structure.
4042 void verify() {
4043 if (!ScheduleStart)
4044 return;
4045
4046 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4047 ScheduleStart->comesBefore(ScheduleEnd) &&
4048 "Not a valid scheduling region?");
4049
4050 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4051 auto *SD = getScheduleData(I);
4052 if (!SD)
4053 continue;
4054 assert(isInSchedulingRegion(SD) &&
4055 "primary schedule data not in window?");
4056 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4057 "entire bundle in window!");
4058 SD->verify();
4059 }
4060
4061 for (auto *SD : ReadyInsts) {
4062 assert(SD->isSchedulingEntity() && SD->isReady() &&
4063 "item in ready list not ready?");
4064 (void)SD;
4065 }
4066 }
4067
4068 /// Put all instructions into the ReadyList which are ready for scheduling.
4069 template <typename ReadyListType>
4070 void initialFillReadyList(ReadyListType &ReadyList) {
4071 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4072 ScheduleData *SD = getScheduleData(I);
4073 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4074 SD->isReady()) {
4075 ReadyList.insert(SD);
// NOTE(review): the `LLVM_DEBUG(dbgs()` opener of this debug statement (doc
// line 4076) was lost in extraction; confirm against upstream.
4077 << "SLP: initially in ready list: " << *SD << "\n");
4078 }
4079 }
4080 }
4081
4082 /// Build a bundle from the ScheduleData nodes corresponding to the
4083 /// scalar instruction for each lane.
4084 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4085
4086 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4087 /// cyclic dependencies. This is only a dry-run, no instructions are
4088 /// actually moved at this stage.
4089 /// \returns the scheduling bundle. The returned Optional value is not
4090 /// std::nullopt if \p VL is allowed to be scheduled.
4091 std::optional<ScheduleData *>
4092 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4093 const InstructionsState &S);
4094
4095 /// Un-bundles a group of instructions.
4096 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4097
4098 /// Allocates schedule data chunk.
4099 ScheduleData *allocateScheduleDataChunks();
4100
4101 /// Extends the scheduling region so that V is inside the region.
4102 /// \returns true if the region size is within the limit.
4103 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4104
4105 /// Initialize the ScheduleData structures for new instructions in the
4106 /// scheduling region.
4107 void initScheduleData(Instruction *FromI, Instruction *ToI,
4108 ScheduleData *PrevLoadStore,
4109 ScheduleData *NextLoadStore);
4110
4111 /// Updates the dependency information of a bundle and of all instructions/
4112 /// bundles which depend on the original bundle.
4113 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4114 BoUpSLP *SLP);
4115
4116 /// Sets all instruction in the scheduling region to un-scheduled.
4117 void resetSchedule();
4118
4119 BasicBlock *BB;
4120
4121 /// Simple memory allocation for ScheduleData.
// NOTE(review): the ScheduleDataChunks declaration (doc line 4122) was lost
// in extraction — presumably a vector of owned ScheduleData arrays used by
// allocateScheduleDataChunks(); confirm against upstream.
4123
4124 /// The size of a ScheduleData array in ScheduleDataChunks.
4125 int ChunkSize;
4126
4127 /// The allocator position in the current chunk, which is the last entry
4128 /// of ScheduleDataChunks.
4129 int ChunkPos;
4130
4131 /// Attaches ScheduleData to Instruction.
4132 /// Note that the mapping survives during all vectorization iterations, i.e.
4133 /// ScheduleData structures are recycled.
// NOTE(review): the ScheduleDataMap declaration (doc line 4134) was lost in
// extraction — presumably DenseMap<Instruction *, ScheduleData *> given the
// lookup in getScheduleData(); confirm against upstream.
4135
4136 /// The ready-list for scheduling (only used for the dry-run).
4137 SetVector<ScheduleData *> ReadyInsts;
4138
4139 /// The first instruction of the scheduling region.
4140 Instruction *ScheduleStart = nullptr;
4141
4142 /// The first instruction _after_ the scheduling region.
4143 Instruction *ScheduleEnd = nullptr;
4144
4145 /// The first memory accessing instruction in the scheduling region
4146 /// (can be null).
4147 ScheduleData *FirstLoadStoreInRegion = nullptr;
4148
4149 /// The last memory accessing instruction in the scheduling region
4150 /// (can be null).
4151 ScheduleData *LastLoadStoreInRegion = nullptr;
4152
4153 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4154 /// region? Used to optimize the dependence calculation for the
4155 /// common case where there isn't.
4156 bool RegionHasStackSave = false;
4157
4158 /// The current size of the scheduling region.
4159 int ScheduleRegionSize = 0;
4160
4161 /// The maximum size allowed for the scheduling region.
4162 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4163
4164 /// The ID of the scheduling region. For a new vectorization iteration this
4165 /// is incremented which "removes" all ScheduleData from the region.
4166 /// Make sure that the initial SchedulingRegionID is greater than the
4167 /// initial SchedulingRegionID in ScheduleData (which is 0).
4168 int SchedulingRegionID = 1;
4169 };
4170
4171 /// Attaches the BlockScheduling structures to basic blocks.
// NOTE(review): the map declaration (doc line 4172) was lost in extraction —
// presumably a DenseMap from BasicBlock* to an owned BlockScheduling;
// confirm against upstream.
4173
4174 /// Performs the "real" scheduling. Done before vectorization is actually
4175 /// performed in a basic block.
4176 void scheduleBlock(BlockScheduling *BS);
4177
4178 /// List of users to ignore during scheduling and that don't need extracting.
4179 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4180
4181 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4182 /// sorted SmallVectors of unsigned.
4183 struct OrdersTypeDenseMapInfo {
4184 static OrdersType getEmptyKey() {
4185 OrdersType V;
4186 V.push_back(~1U);
4187 return V;
4188 }
4189
4190 static OrdersType getTombstoneKey() {
4191 OrdersType V;
4192 V.push_back(~2U);
4193 return V;
4194 }
4195
4196 static unsigned getHashValue(const OrdersType &V) {
4197 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4198 }
4199
4200 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4201 return LHS == RHS;
4202 }
4203 };
4204
4205 // Analysis and block reference.
4206 Function *F;
4207 ScalarEvolution *SE;
// NOTE(review): a member declaration (doc line 4208) was lost in extraction —
// presumably the TargetTransformInfo pointer (TTI is dereferenced in
// newTreeEntry above); confirm against upstream.
4209 TargetLibraryInfo *TLI;
4210 LoopInfo *LI;
4211 DominatorTree *DT;
4212 AssumptionCache *AC;
4213 DemandedBits *DB;
4214 const DataLayout *DL;
// NOTE(review): a member declaration (doc line 4215) was lost in extraction;
// confirm against upstream.
4216
4217 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4218 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4219
4220 /// Instruction builder to construct the vectorized tree.
// NOTE(review): the builder declaration (doc line 4221) was lost in
// extraction — presumably an IRBuilder instance; confirm against upstream.
4222
4223 /// A map of scalar integer values to the smallest bit width with which they
4224 /// can legally be represented. The values map to (width, signed) pairs,
4225 /// where "width" indicates the minimum bit width and "signed" is True if the
4226 /// value must be signed-extended, rather than zero-extended, back to its
4227 /// original width.
// NOTE(review): the map declaration (doc line 4228) was lost in extraction;
// confirm against upstream.
4229
4230 /// Final size of the reduced vector, if the current graph represents the
4231 /// input for the reduction and it was possible to narrow the size of the
4232 /// reduction.
4233 unsigned ReductionBitWidth = 0;
4234
4235 /// Canonical graph size before the transformations.
4236 unsigned BaseGraphSize = 1;
4237
4238 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4239 /// type sizes, used in the tree.
4240 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4241
4242 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4243 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4244 DenseSet<unsigned> ExtraBitWidthNodes;
4245};
4246
4247} // end namespace slpvectorizer
4248
// NOTE(review): several declaration lines of this specialization (the NodeRef
// and ContainerTy using-declarations, ChildIteratorType's class head and
// constructor, and the getEntryNode/child_begin/child_end/operator++ signature
// lines) were dropped by the documentation renderer — the bare numbers below
// mark where they were. Only the extracted bodies are shown; confirm exact
// signatures against the upstream source before relying on them.
4249template <> struct GraphTraits<BoUpSLP *> {
4250  using TreeEntry = BoUpSLP::TreeEntry;
4251
4252  /// NodeRef has to be a pointer per the GraphWriter.
 4254
 4256
4257  /// Add the VectorizableTree to the index iterator to be able to return
4258  /// TreeEntry pointers.
 4260      : public iterator_adaptor_base<
4261            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
 4263
 4267
    // Dereferencing a child edge yields the user TreeEntry of that EdgeInfo.
4268    NodeRef operator*() { return I->UserTE; }
4269  };
4270
  // Entry node body: the root of the graph is the first entry of the
  // VectorizableTree (index 0).
 4272    return R.VectorizableTree[0].get();
 4273  }
4274
  // Child range bodies: iterate the node's UserTreeIndices edges; the node's
  // Container is carried along by the adaptor.
 4276    return {N->UserTreeIndices.begin(), N->Container};
 4277  }
4278
 4280    return {N->UserTreeIndices.end(), N->Container};
 4281  }
4282
4283  /// For the node iterator we just need to turn the TreeEntry iterator into a
4284  /// TreeEntry* iterator so that it dereferences to NodeRef.
 4286    using ItTy = ContainerTy::iterator;
 4287    ItTy It;
4288
 4289  public:
4290    nodes_iterator(const ItTy &It2) : It(It2) {}
    // Dereference through the owning pointer stored in the tree container.
4291    NodeRef operator*() { return It->get(); }
    // Pre-increment body (signature line dropped by the renderer).
 4293      ++It;
 4294      return *this;
 4295    }
4296    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4297  };
4298
  // nodes_begin/nodes_end bodies: wrap the whole VectorizableTree.
 4300    return nodes_iterator(R->VectorizableTree.begin());
 4301  }
4302
 4304    return nodes_iterator(R->VectorizableTree.end());
 4305  }
4306
4307  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4308};
4309
4310template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4311  using TreeEntry = BoUpSLP::TreeEntry;
4312
4313  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4314
  /// Builds the DOT label for a tree node: the node index, an optional
  /// "<splat>" tag when all scalars are identical, then one line per scalar,
  /// marked " <extract>" if that scalar also has users outside the tree
  /// (i.e. appears in R->ExternalUses).
4315  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4316    std::string Str;
    // NOTE(review): the stream declaration (presumably
    // `raw_string_ostream OS(Str);` on the dropped line 4317) is not visible
    // in this rendering — confirm against upstream.
 4318    OS << Entry->Idx << ".\n";
4319    if (isSplat(Entry->Scalars))
4320      OS << "<splat> ";
4321    for (auto *V : Entry->Scalars) {
4322      OS << *V;
      // Flag scalars that must be extracted back to scalar form for
      // users outside the vectorized tree.
4323      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4324            return EU.Scalar == V;
4325          }))
4326        OS << " <extract>";
4327      OS << "\n";
4328    }
4329    return Str;
4330  }
4331
4332 static std::string getNodeAttributes(const TreeEntry *Entry,
4333 const BoUpSLP *) {
4334 if (Entry->isGather())
4335 return "color=red";
4336 if (Entry->State == TreeEntry::ScatterVectorize ||
4337 Entry->State == TreeEntry::StridedVectorize)
4338 return "color=blue";
4339 return "";
4340 }
4341};
4342
4343} // end namespace llvm
4344
// NOTE(review): this span is the tail of a member function whose header lies
// above the visible region. It erases every instruction collected in
// DeletedInstructions and queues newly-dead operands for cleanup.
 4347  for (auto *I : DeletedInstructions) {
    // Instructions already unlinked from a block must be re-inserted
    // somewhere before eraseFromParent() can be called on them.
 4348    if (!I->getParent()) {
 4349      // Temporarily insert instruction back to erase them from parent and
 4350      // memory later.
 4351      if (isa<PHINode>(I))
 4352        // Phi nodes must be the very first instructions in the block.
 4353        I->insertBefore(F->getEntryBlock(),
 4354                        F->getEntryBlock().getFirstNonPHIIt());
 4355      else
 4356        I->insertBefore(F->getEntryBlock().getTerminator());
 4357      continue;
 4358    }
    // Queue operands that become dead once I is deleted. NOTE(review): part
    // of this condition (dropped line 4362) is not visible in this
    // rendering — confirm the full predicate against upstream.
 4359    for (Use &U : I->operands()) {
 4360      auto *Op = dyn_cast<Instruction>(U.get());
 4361      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 4363        DeadInsts.emplace_back(Op);
 4364    }
    // Detach I from its operands so the erase pass below sees no
    // remaining inter-instruction uses.
 4365    I->dropAllReferences();
 4366  }
  // Second pass: all references dropped, now actually erase.
 4367  for (auto *I : DeletedInstructions) {
 4368    assert(I->use_empty() &&
 4369           "trying to erase instruction with users.");
 4370    I->eraseFromParent();
 4371  }
4372
4373  // Cleanup any dead scalar code feeding the vectorized instructions
  // NOTE(review): the call on the dropped line 4375 (presumably
  // RecursivelyDeleteTriviallyDeadInstructions(DeadInsts)) is not visible
  // here — confirm against upstream.
 4375
4376#ifdef EXPENSIVE_CHECKS
4377  // If we could guarantee that this call is not extremely slow, we could
4378  // remove the ifdef limitation (see PR47712).
4379  assert(!verifyFunction(*F, &dbgs()));
4380#endif
4381}
4382
4383/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4384/// contains original mask for the scalars reused in the node. Procedure
4385/// transform this mask in accordance with the given \p Mask.
// NOTE(review): the signature line (dropped line 4386, presumably
// `static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask)`)
// is not visible in this rendering.
 4387  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
 4388         "Expected non-empty mask.");
  // Snapshot the incoming reuse mask, then swap it into place so positions
  // not remapped below retain their original values.
 4389  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
 4390  Prev.swap(Reuses);
  // Scatter: the old value at position I moves to position Mask[I];
  // positions with PoisonMaskElem in Mask are left untouched.
 4391  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
 4392    if (Mask[I] != PoisonMaskElem)
 4393      Reuses[Mask[I]] = Prev[I];
4394}
4395
4396/// Reorders the given \p Order according to the given \p Mask. \p Order - is
4397/// the original order of the scalars. Procedure transforms the provided order
4398/// in accordance with the given \p Mask. If the resulting \p Order is just an
4399/// identity order, \p Order is cleared.
// NOTE(review): the first signature line (dropped line 4400, presumably
// `static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int>
// Mask,`) is not visible in this rendering.
 4401                         bool BottomOrder = false) {
 4402  assert(!Mask.empty() && "Expected non-empty mask.");
 4403  unsigned Sz = Mask.size();
  // Bottom-order path: apply Mask directly on top of the previous order.
 4404  if (BottomOrder) {
 4405    SmallVector<unsigned> PrevOrder;
    // An empty Order means identity; materialize it so indexing works.
 4406    if (Order.empty()) {
 4407      PrevOrder.resize(Sz);
 4408      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
 4409    } else {
 4410      PrevOrder.swap(Order);
 4411    }
    // Sz acts as the "unset" sentinel for slots Mask leaves poisoned.
 4412    Order.assign(Sz, Sz);
 4413    for (unsigned I = 0; I < Sz; ++I)
 4414      if (Mask[I] != PoisonMaskElem)
 4415        Order[I] = PrevOrder[Mask[I]];
    // Identity (treating unset slots as wildcards) => drop the order.
 4416    if (all_of(enumerate(Order), [&](const auto &Data) {
 4417          return Data.value() == Sz || Data.index() == Data.value();
 4418        })) {
 4419      Order.clear();
 4420      return;
 4421    }
 4422    fixupOrderingIndices(Order);
 4423    return;
 4424  }
  // Top-order path: build the inverse permutation of Order (or identity),
  // permute it by Mask, and invert back into Order.
 4425  SmallVector<int> MaskOrder;
 4426  if (Order.empty()) {
 4427    MaskOrder.resize(Sz);
 4428    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
 4429  } else {
 4430    inversePermutation(Order, MaskOrder);
 4431  }
 4432  reorderReuses(MaskOrder, Mask);
  // The combined permutation is identity => no reordering needed.
 4433  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
 4434    Order.clear();
 4435    return;
 4436  }
 4437  Order.assign(Sz, Sz);
 4438  for (unsigned I = 0; I < Sz; ++I)
 4439    if (MaskOrder[I] != PoisonMaskElem)
 4440      Order[MaskOrder[I]] = I;
 4441  fixupOrderingIndices(Order);
4442}
4443
4444std::optional<BoUpSLP::OrdersType>
4445BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4446 assert(TE.isGather() && "Expected gather node only.");
4447 // Try to find subvector extract/insert patterns and reorder only such
4448 // patterns.
4449 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4450 Type *ScalarTy = GatheredScalars.front()->getType();
4451 int NumScalars = GatheredScalars.size();
4452 if (!isValidElementType(ScalarTy))
4453 return std::nullopt;
4454 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4455 int NumParts = TTI->getNumberOfParts(VecTy);
4456 if (NumParts == 0 || NumParts >= NumScalars ||
4457 VecTy->getNumElements() % NumParts != 0 ||
4458 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4459 VecTy->getNumElements() / NumParts))
4460 NumParts = 1;
4461 SmallVector<int> ExtractMask;
4462 SmallVector<int> Mask;
4465 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4467 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4468 /*ForOrder=*/true);
4469 // No shuffled operands - ignore.
4470 if (GatherShuffles.empty() && ExtractShuffles.empty())
4471 return std::nullopt;
4472 OrdersType CurrentOrder(NumScalars, NumScalars);
4473 if (GatherShuffles.size() == 1 &&
4474 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4475 Entries.front().front()->isSame(TE.Scalars)) {
4476 // Perfect match in the graph, will reuse the previously vectorized
4477 // node. Cost is 0.
4478 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4479 return CurrentOrder;
4480 }
4481 auto IsSplatMask = [](ArrayRef<int> Mask) {
4482 int SingleElt = PoisonMaskElem;
4483 return all_of(Mask, [&](int I) {
4484 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4485 SingleElt = I;
4486 return I == PoisonMaskElem || I == SingleElt;
4487 });
4488 };
4489 // Exclusive broadcast mask - ignore.
4490 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4491 (Entries.size() != 1 ||
4492 Entries.front().front()->ReorderIndices.empty())) ||
4493 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4494 return std::nullopt;
4495 SmallBitVector ShuffledSubMasks(NumParts);
4496 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4497 ArrayRef<int> Mask, int PartSz, int NumParts,
4498 function_ref<unsigned(unsigned)> GetVF) {
4499 for (int I : seq<int>(0, NumParts)) {
4500 if (ShuffledSubMasks.test(I))
4501 continue;
4502 const int VF = GetVF(I);
4503 if (VF == 0)
4504 continue;
4505 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4506 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4507 // Shuffle of at least 2 vectors - ignore.
4508 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4509 std::fill(Slice.begin(), Slice.end(), NumScalars);
4510 ShuffledSubMasks.set(I);
4511 continue;
4512 }
4513 // Try to include as much elements from the mask as possible.
4514 int FirstMin = INT_MAX;
4515 int SecondVecFound = false;
4516 for (int K : seq<int>(Limit)) {
4517 int Idx = Mask[I * PartSz + K];
4518 if (Idx == PoisonMaskElem) {
4519 Value *V = GatheredScalars[I * PartSz + K];
4520 if (isConstant(V) && !isa<PoisonValue>(V)) {
4521 SecondVecFound = true;
4522 break;
4523 }
4524 continue;
4525 }
4526 if (Idx < VF) {
4527 if (FirstMin > Idx)
4528 FirstMin = Idx;
4529 } else {
4530 SecondVecFound = true;
4531 break;
4532 }
4533 }
4534 FirstMin = (FirstMin / PartSz) * PartSz;
4535 // Shuffle of at least 2 vectors - ignore.
4536 if (SecondVecFound) {
4537 std::fill(Slice.begin(), Slice.end(), NumScalars);
4538 ShuffledSubMasks.set(I);
4539 continue;
4540 }
4541 for (int K : seq<int>(Limit)) {
4542 int Idx = Mask[I * PartSz + K];
4543 if (Idx == PoisonMaskElem)
4544 continue;
4545 Idx -= FirstMin;
4546 if (Idx >= PartSz) {
4547 SecondVecFound = true;
4548 break;
4549 }
4550 if (CurrentOrder[I * PartSz + Idx] >
4551 static_cast<unsigned>(I * PartSz + K) &&
4552 CurrentOrder[I * PartSz + Idx] !=
4553 static_cast<unsigned>(I * PartSz + Idx))
4554 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4555 }
4556 // Shuffle of at least 2 vectors - ignore.
4557 if (SecondVecFound) {
4558 std::fill(Slice.begin(), Slice.end(), NumScalars);
4559 ShuffledSubMasks.set(I);
4560 continue;
4561 }
4562 }
4563 };
4564 int PartSz = getPartNumElems(NumScalars, NumParts);
4565 if (!ExtractShuffles.empty())
4566 TransformMaskToOrder(
4567 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4568 if (!ExtractShuffles[I])
4569 return 0U;
4570 unsigned VF = 0;
4571 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4572 for (unsigned Idx : seq<unsigned>(Sz)) {
4573 int K = I * PartSz + Idx;
4574 if (ExtractMask[K] == PoisonMaskElem)
4575 continue;
4576 if (!TE.ReuseShuffleIndices.empty())
4577 K = TE.ReuseShuffleIndices[K];
4578 if (!TE.ReorderIndices.empty())
4579 K = std::distance(TE.ReorderIndices.begin(),
4580 find(TE.ReorderIndices, K));
4581 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4582 if (!EI)
4583 continue;
4584 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4585 ->getElementCount()
4586 .getKnownMinValue());
4587 }
4588 return VF;
4589 });
4590 // Check special corner case - single shuffle of the same entry.
4591 if (GatherShuffles.size() == 1 && NumParts != 1) {
4592 if (ShuffledSubMasks.any())
4593 return std::nullopt;
4594 PartSz = NumScalars;
4595 NumParts = 1;
4596 }
4597 if (!Entries.empty())
4598 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4599 if (!GatherShuffles[I])
4600 return 0U;
4601 return std::max(Entries[I].front()->getVectorFactor(),
4602 Entries[I].back()->getVectorFactor());
4603 });
4604 int NumUndefs =
4605 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4606 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4607 return std::nullopt;
4608 return std::move(CurrentOrder);
4609}
4610
4611static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4612 const TargetLibraryInfo &TLI,
4613 bool CompareOpcodes = true) {
4614 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4615 return false;
4616 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4617 if (!GEP1)
4618 return false;
4619 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4620 if (!GEP2)
4621 return false;
4622 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4623 ((isConstant(GEP1->getOperand(1)) &&
4624 isConstant(GEP2->getOperand(1))) ||
4625 !CompareOpcodes ||
4626 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4627 .getOpcode());
4628}
4629
4630/// Calculates minimal alignment as a common alignment.
4631template <typename T>
// NOTE(review): the declaration line (dropped line 4632, presumably
// `static Align computeCommonAlignment(ArrayRef<Value *> VL)`) is not
// visible in this rendering.
 4633  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  // Fold in every remaining access; std::min keeps the weakest (smallest)
  // alignment guarantee, which is the only one valid for all of VL.
 4634  for (Value *V : VL.drop_front())
 4635    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
 4636  return CommonAlignment;
4637}
4638
4639/// Check if \p Order represents reverse order.
// NOTE(review): the signature line (dropped line 4640, presumably
// `static bool isReverseOrder(ArrayRef<unsigned> Order)`) is not visible in
// this rendering.
 4641  unsigned Sz = Order.size();
  // A slot holding Sz is "unassigned" and matches any position; every
  // assigned slot at index I must hold Sz - I - 1 for a strict reversal.
  // An empty order is not considered reversed.
 4642  return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
 4643    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
 4644  });
4645}
4646
4647/// Checks if the provided list of pointers \p Pointers represents the strided
4648/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4649/// Otherwise, if \p Inst is not specified, just initialized optional value is
4650/// returned to show that the pointers represent strided pointers. If \p Inst
4651/// specified, the runtime stride is materialized before the given \p Inst.
4652/// \returns std::nullopt if the pointers are not pointers with the runtime
4653/// stride, nullptr or actual stride value, otherwise.
4654static std::optional<Value *>
4656 const DataLayout &DL, ScalarEvolution &SE,
4657 SmallVectorImpl<unsigned> &SortedIndices,
4658 Instruction *Inst = nullptr) {
4660 const SCEV *PtrSCEVLowest = nullptr;
4661 const SCEV *PtrSCEVHighest = nullptr;
4662 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4663 // addresses).
4664 for (Value *Ptr : PointerOps) {
4665 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4666 if (!PtrSCEV)
4667 return std::nullopt;
4668 SCEVs.push_back(PtrSCEV);
4669 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4670 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4671 continue;
4672 }
4673 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4674 if (isa<SCEVCouldNotCompute>(Diff))
4675 return std::nullopt;
4676 if (Diff->isNonConstantNegative()) {
4677 PtrSCEVLowest = PtrSCEV;
4678 continue;
4679 }
4680 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4681 if (isa<SCEVCouldNotCompute>(Diff1))
4682 return std::nullopt;
4683 if (Diff1->isNonConstantNegative()) {
4684 PtrSCEVHighest = PtrSCEV;
4685 continue;
4686 }
4687 }
4688 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4689 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4690 if (isa<SCEVCouldNotCompute>(Dist))
4691 return std::nullopt;
4692 int Size = DL.getTypeStoreSize(ElemTy);
4693 auto TryGetStride = [&](const SCEV *Dist,
4694 const SCEV *Multiplier) -> const SCEV * {
4695 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4696 if (M->getOperand(0) == Multiplier)
4697 return M->getOperand(1);
4698 if (M->getOperand(1) == Multiplier)
4699 return M->getOperand(0);
4700 return nullptr;
4701 }
4702 if (Multiplier == Dist)
4703 return SE.getConstant(Dist->getType(), 1);
4704 return SE.getUDivExactExpr(Dist, Multiplier);
4705 };
4706 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4707 const SCEV *Stride = nullptr;
4708 if (Size != 1 || SCEVs.size() > 2) {
4709 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4710 Stride = TryGetStride(Dist, Sz);
4711 if (!Stride)
4712 return std::nullopt;
4713 }
4714 if (!Stride || isa<SCEVConstant>(Stride))
4715 return std::nullopt;
4716 // Iterate through all pointers and check if all distances are
4717 // unique multiple of Stride.
4718 using DistOrdPair = std::pair<int64_t, int>;
4719 auto Compare = llvm::less_first();
4720 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4721 int Cnt = 0;
4722 bool IsConsecutive = true;
4723 for (const SCEV *PtrSCEV : SCEVs) {
4724 unsigned Dist = 0;
4725 if (PtrSCEV != PtrSCEVLowest) {
4726 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4727 const SCEV *Coeff = TryGetStride(Diff, Stride);
4728 if (!Coeff)
4729 return std::nullopt;
4730 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4731 if (!SC || isa<SCEVCouldNotCompute>(SC))
4732 return std::nullopt;
4733 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4734 SE.getMulExpr(Stride, SC)))
4735 ->isZero())
4736 return std::nullopt;
4737 Dist = SC->getAPInt().getZExtValue();
4738 }
4739 // If the strides are not the same or repeated, we can't vectorize.
4740 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4741 return std::nullopt;
4742 auto Res = Offsets.emplace(Dist, Cnt);
4743 if (!Res.second)
4744 return std::nullopt;
4745 // Consecutive order if the inserted element is the last one.
4746 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4747 ++Cnt;
4748 }
4749 if (Offsets.size() != SCEVs.size())
4750 return std::nullopt;
4751 SortedIndices.clear();
4752 if (!IsConsecutive) {
4753 // Fill SortedIndices array only if it is non-consecutive.
4754 SortedIndices.resize(PointerOps.size());
4755 Cnt = 0;
4756 for (const std::pair<int64_t, int> &Pair : Offsets) {
4757 SortedIndices[Cnt] = Pair.second;
4758 ++Cnt;
4759 }
4760 }
4761 if (!Inst)
4762 return nullptr;
4763 SCEVExpander Expander(SE, DL, "strided-load-vec");
4764 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4765}
4766
4767static std::pair<InstructionCost, InstructionCost>
4769 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4770 Type *ScalarTy, VectorType *VecTy);
4771
4772/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4773/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
4774/// subvector pattern.
4775static InstructionCost
4777 VectorType *Tp, ArrayRef<int> Mask = {},
4779 int Index = 0, VectorType *SubTp = nullptr,
4781 if (Kind != TTI::SK_PermuteTwoSrc)
4782 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4783 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4784 int NumSubElts;
4786 Mask, NumSrcElts, NumSubElts, Index)) {
4787 if (Index + NumSubElts > NumSrcElts &&
4788 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4789 return TTI.getShuffleCost(
4791 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4793 }
4794 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4795}
4796
4800 SmallVectorImpl<Value *> &PointerOps,
4801 unsigned *BestVF, bool TryRecursiveCheck) const {
4802 // Check that a vectorized load would load the same memory as a scalar
4803 // load. For example, we don't want to vectorize loads that are smaller
4804 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4805 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4806 // from such a struct, we read/write packed bits disagreeing with the
4807 // unvectorized version.
4808 if (BestVF)
4809 *BestVF = 0;
4811 return LoadsState::Gather;
4812 Type *ScalarTy = VL0->getType();
4813
4814 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4815 return LoadsState::Gather;
4816
4817 // Make sure all loads in the bundle are simple - we can't vectorize
4818 // atomic or volatile loads.
4819 PointerOps.clear();
4820 const unsigned Sz = VL.size();
4821 PointerOps.resize(Sz);
4822 auto *POIter = PointerOps.begin();
4823 for (Value *V : VL) {
4824 auto *L = cast<LoadInst>(V);
4825 if (!L->isSimple())
4826 return LoadsState::Gather;
4827 *POIter = L->getPointerOperand();
4828 ++POIter;
4829 }
4830
4831 Order.clear();
4832 // Check the order of pointer operands or that all pointers are the same.
4833 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4834
4835 auto *VecTy = getWidenedType(ScalarTy, Sz);
4836 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4837 if (!IsSorted) {
4838 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
4839 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4840 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
4842 }
4843
4844 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
4845 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
4846 return LoadsState::Gather;
4847
4848 if (!all_of(PointerOps, [&](Value *P) {
4849 return arePointersCompatible(P, PointerOps.front(), *TLI);
4850 }))
4851 return LoadsState::Gather;
4852
4853 } else {
4854 Value *Ptr0;
4855 Value *PtrN;
4856 if (Order.empty()) {
4857 Ptr0 = PointerOps.front();
4858 PtrN = PointerOps.back();
4859 } else {
4860 Ptr0 = PointerOps[Order.front()];
4861 PtrN = PointerOps[Order.back()];
4862 }
4863 std::optional<int> Diff =
4864 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4865 // Check that the sorted loads are consecutive.
4866 if (static_cast<unsigned>(*Diff) == Sz - 1)
4867 return LoadsState::Vectorize;
4868 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
4869 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
4870 return LoadsState::Gather;
4871 // Simple check if not a strided access - clear order.
4872 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4873 // Try to generate strided load node if:
4874 // 1. Target with strided load support is detected.
4875 // 2. The number of loads is greater than MinProfitableStridedLoads,
4876 // or the potential stride <= MaxProfitableLoadStride and the
4877 // potential stride is power-of-2 (to avoid perf regressions for the very
4878 // small number of loads) and max distance > number of loads, or potential
4879 // stride is -1.
4880 // 3. The loads are ordered, or number of unordered loads <=
4881 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4882 // (this check is to avoid extra costs for very expensive shuffles).
4883 // 4. Any pointer operand is an instruction with the users outside of the
4884 // current graph (for masked gathers extra extractelement instructions
4885 // might be required).
4886 auto IsAnyPointerUsedOutGraph =
4887 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
4888 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
4889 return !getTreeEntry(U) && !MustGather.contains(U);
4890 });
4891 });
4892 const unsigned AbsoluteDiff = std::abs(*Diff);
4893 if (IsPossibleStrided && (IsAnyPointerUsedOutGraph ||
4895 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
4896 has_single_bit(AbsoluteDiff))) &&
4897 AbsoluteDiff > Sz) ||
4898 *Diff == -(static_cast<int>(Sz) - 1))) {
4899 int Stride = *Diff / static_cast<int>(Sz - 1);
4900 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4901 Align Alignment =
4902 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4903 ->getAlign();
4904 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4905 // Iterate through all pointers and check if all distances are
4906 // unique multiple of Dist.
4907 SmallSet<int, 4> Dists;
4908 for (Value *Ptr : PointerOps) {
4909 int Dist = 0;
4910 if (Ptr == PtrN)
4911 Dist = *Diff;
4912 else if (Ptr != Ptr0)
4913 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4914 // If the strides are not the same or repeated, we can't
4915 // vectorize.
4916 if (((Dist / Stride) * Stride) != Dist ||
4917 !Dists.insert(Dist).second)
4918 break;
4919 }
4920 if (Dists.size() == Sz)
4922 }
4923 }
4924 }
4925 }
4926 // Correctly identify compare the cost of loads + shuffles rather than
4927 // strided/masked gather loads. Returns true if vectorized + shuffles
4928 // representation is better than just gather.
4929 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
4930 unsigned *BestVF,
4931 bool ProfitableGatherPointers) {
4932 if (BestVF)
4933 *BestVF = 0;
4934 // Compare masked gather cost and loads + insert subvector costs.
4936 auto [ScalarGEPCost, VectorGEPCost] =
4937 getGEPCosts(TTI, PointerOps, PointerOps.front(),
4938 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
4939 // Estimate the cost of masked gather GEP. If not a splat, roughly
4940 // estimate as a buildvector, otherwise estimate as splat.
4941 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
4942 VectorType *PtrVecTy =
4943 getWidenedType(PointerOps.front()->getType()->getScalarType(),
4944 VecTy->getNumElements());
4945 if (static_cast<unsigned>(count_if(
4946 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
4947 any_of(PointerOps, [&](Value *V) {
4948 return getUnderlyingObject(V) !=
4949 getUnderlyingObject(PointerOps.front());
4950 }))
4951 VectorGEPCost += TTI.getScalarizationOverhead(
4952 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
4953 else
4954 VectorGEPCost +=
4956 PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
4957 /*Insert=*/true, /*Extract=*/false, CostKind) +
4959 // The cost of scalar loads.
4960 InstructionCost ScalarLoadsCost =
4961 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
4962 [&](InstructionCost C, Value *V) {
4963 return C + TTI.getInstructionCost(
4965 }) +
4966 ScalarGEPCost;
4967 // The cost of masked gather.
4968 InstructionCost MaskedGatherCost =
4970 Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
4971 /*VariableMask=*/false, CommonAlignment, CostKind) +
4972 (ProfitableGatherPointers ? 0 : VectorGEPCost);
4973 InstructionCost GatherCost =
4974 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
4975 /*Extract=*/false, CostKind) +
4976 ScalarLoadsCost;
4977 // The list of loads is small or perform partial check already - directly
4978 // compare masked gather cost and gather cost.
4979 constexpr unsigned ListLimit = 4;
4980 if (!TryRecursiveCheck || VL.size() < ListLimit)
4981 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
4982
4983 // FIXME: The following code has not been updated for non-power-of-2
4984 // vectors. The splitting logic here does not cover the original
4985 // vector if the vector factor is not a power of two. FIXME
4986 if (!has_single_bit(VL.size()))
4987 return false;
4988
4989 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4990 unsigned MinVF = getMinVF(2 * Sz);
4991 DemandedElts.clearAllBits();
4992 // Iterate through possible vectorization factors and check if vectorized +
4993 // shuffles is better than just gather.
4994 for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
4996 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
4997 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
4999 SmallVector<Value *> PointerOps;
5000 LoadsState LS =
5001 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5002 /*TryRecursiveCheck=*/false);
5003 // Check that the sorted loads are consecutive.
5004 if (LS == LoadsState::Gather) {
5005 if (BestVF) {
5006 DemandedElts.setAllBits();
5007 break;
5008 }
5009 DemandedElts.setBits(Cnt, Cnt + VF);
5010 continue;
5011 }
5012 // If need the reorder - consider as high-cost masked gather for now.
5013 if ((LS == LoadsState::Vectorize ||
5015 !Order.empty() && !isReverseOrder(Order))
5017 States.push_back(LS);
5018 }
5019 if (DemandedElts.isAllOnes())
5020 // All loads gathered - try smaller VF.
5021 continue;
5022 // Can be vectorized later as a serie of loads/insertelements.
5023 InstructionCost VecLdCost = 0;
5024 if (!DemandedElts.isZero()) {
5025 VecLdCost =
5026 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5027 /*Extract=*/false, CostKind) +
5028 ScalarGEPCost;
5029 for (unsigned Idx : seq<unsigned>(VL.size()))
5030 if (DemandedElts[Idx])
5031 VecLdCost +=
5033 }
5034 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5035 for (auto [I, LS] : enumerate(States)) {
5036 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5037 InstructionCost VectorGEPCost =
5038 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5039 ? 0
5040 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5041 LI0->getPointerOperand(),
5042 Instruction::GetElementPtr, CostKind, ScalarTy,
5043 SubVecTy)
5044 .second;
5045 if (LS == LoadsState::ScatterVectorize) {
5046 if (static_cast<unsigned>(
5047 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5048 PointerOps.size() - 1 ||
5049 any_of(PointerOps, [&](Value *V) {
5050 return getUnderlyingObject(V) !=
5051 getUnderlyingObject(PointerOps.front());
5052 }))
5053 VectorGEPCost += TTI.getScalarizationOverhead(
5054 SubVecTy, APInt::getAllOnes(VF),
5055 /*Insert=*/true, /*Extract=*/false, CostKind);
5056 else
5057 VectorGEPCost += TTI.getScalarizationOverhead(
5058 SubVecTy, APInt::getOneBitSet(VF, 0),
5059 /*Insert=*/true, /*Extract=*/false, CostKind) +
5061 {}, CostKind);
5062 }
5063 switch (LS) {
5065 VecLdCost +=
5066 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5067 LI0->getPointerAddressSpace(), CostKind,
5069 VectorGEPCost;
5070 break;
5072 VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5073 LI0->getPointerOperand(),
5074 /*VariableMask=*/false,
5075 CommonAlignment, CostKind) +
5076 VectorGEPCost;
5077 break;
5079 VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5080 LI0->getPointerOperand(),
5081 /*VariableMask=*/false,
5082 CommonAlignment, CostKind) +
5083 VectorGEPCost;
5084 break;
5085 case LoadsState::Gather:
5086 // Gathers are already calculated - ignore.
5087 continue;
5088 }
5089 SmallVector<int> ShuffleMask(VL.size());
5090 for (int Idx : seq<int>(0, VL.size()))
5091 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5092 if (I > 0)
5093 VecLdCost +=
5094 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5095 CostKind, I * VF, SubVecTy);
5096 }
5097 // If masked gather cost is higher - better to vectorize, so
5098 // consider it as a gather node. It will be better estimated
5099 // later.
5100 if (MaskedGatherCost >= VecLdCost &&
5101 VecLdCost - GatherCost < -SLPCostThreshold) {
5102 if (BestVF)
5103 *BestVF = VF;
5104 return true;
5105 }
5106 }
5107 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5108 };
5109 // TODO: need to improve analysis of the pointers, if not all of them are
5110 // GEPs or have > 2 operands, we end up with a gather node, which just
5111 // increases the cost.
5112 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5113 bool ProfitableGatherPointers =
5114 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5115 return L->isLoopInvariant(V);
5116 })) <= Sz / 2;
5117 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
5119 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
5120 (GEP && GEP->getNumOperands() == 2 &&
5121 isa<Constant, Instruction>(GEP->getOperand(1)));
5122 })) {
5123 // Check if potential masked gather can be represented as series
5124 // of loads + insertsubvectors.
5125 // If masked gather cost is higher - better to vectorize, so
5126 // consider it as a gather node. It will be better estimated
5127 // later.
5128 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5129 ProfitableGatherPointers))
5131 }
5132
5133 return LoadsState::Gather;
5134}
5135
5137 const DataLayout &DL, ScalarEvolution &SE,
5138 SmallVectorImpl<unsigned> &SortedIndices) {
  // Groups the pointers in VL into clusters that share a common base (i.e.
  // have a constant, SCEV-computable distance to the cluster's first pointer)
  // and, when at least one cluster forms a fully consecutive run of offsets,
  // fills SortedIndices with the original indices rearranged so that
  // clustered pointers end up adjacent. Returns false otherwise.
  // NOTE(review): the opening signature line of this definition and a few
  // declaration lines (e.g. the Bases map) are missing from this extract -
  // verify against upstream LLVM before relying on this listing.
5140 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5141 "Expected list of pointer operands.");
5142 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5143 // Ptr into, sort and return the sorted indices with values next to one
5144 // another.
5146 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
5147
  // Cnt is the pointer's original position in VL; it is stored as the third
  // tuple element so the final permutation can be reported via SortedIndices.
5148 unsigned Cnt = 1;
5149 for (Value *Ptr : VL.drop_front()) {
  // A pointer joins the first existing cluster whose base is at a constant
  // distance from it (strict check: same element type, computable diff).
5150 bool Found = any_of(Bases, [&](auto &Base) {
5151 std::optional<int> Diff =
5152 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
5153 /*StrictCheck=*/true);
5154 if (!Diff)
5155 return false;
5156
5157 Base.second.emplace_back(Ptr, *Diff, Cnt++);
5158 return true;
5159 });
5160
5161 if (!Found) {
5162 // If we haven't found enough to usefully cluster, return early.
5163 if (Bases.size() > VL.size() / 2 - 1)
5164 return false;
5165
5166 // Not found already - add a new Base
5167 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
5168 }
5169 }
5170
5171 // For each of the bases sort the pointers by Offset and check if any of the
5172 // base become consecutively allocated.
5173 bool AnyConsecutive = false;
5174 for (auto &Base : Bases) {
5175 auto &Vec = Base.second;
5176 if (Vec.size() > 1) {
  // Sort each cluster by the constant offset (tuple element 1) and test
  // whether the sorted offsets form a run InitialOffset, InitialOffset+1, ...
5177 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5178 const std::tuple<Value *, int, unsigned> &Y) {
5179 return std::get<1>(X) < std::get<1>(Y);
5180 });
5181 int InitialOffset = std::get<1>(Vec[0]);
5182 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5183 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5184 });
5185 }
5186 }
5187
5188 // Fill SortedIndices array only if it looks worth-while to sort the ptrs.
5189 SortedIndices.clear();
5190 if (!AnyConsecutive)
5191 return false;
5192
5193 // If we have a better order, also sort the base pointers by increasing
5194 // (variable) values if possible, to try and keep the order more regular. In
5195 // order to create a valid strict-weak order we cluster by the Root of gep
5196 // chains and sort within each.
5198 for (auto &Base : Bases) {
5199 Value *Strip = Base.first->stripInBoundsConstantOffsets();
5200 Value *Root = Strip;
5201 while (auto *Gep = dyn_cast<GetElementPtrInst>(Root))
5202 Root = Gep->getOperand(0);
5203 SortedBases.emplace_back(Base.first, Strip, Root);
5204 }
  // Partition the bases by their GEP-chain root, then stable-sort within each
  // partition. LessThan[A][B] is set when A is reached by walking B's GEP
  // operand chain, i.e. A is an "ancestor" pointer of B.
5205 auto *Begin = SortedBases.begin();
5206 auto *End = SortedBases.end();
5207 while (Begin != End) {
5208 Value *Root = std::get<2>(*Begin);
5209 auto *Mid = std::stable_partition(
5210 Begin, End, [&Root](auto V) { return std::get<2>(V) == Root; });
5212 for (auto *I = Begin; I < Mid; ++I)
5213 LessThan.try_emplace(std::get<1>(*I));
5214 for (auto *I = Begin; I < Mid; ++I) {
5215 Value *V = std::get<1>(*I);
5216 while (auto *Gep = dyn_cast<GetElementPtrInst>(V)) {
5217 V = Gep->getOperand(0);
5218 if (LessThan.contains(V))
5219 LessThan[V][std::get<1>(*I)] = true;
5220 }
5221 }
5222 std::stable_sort(Begin, Mid, [&LessThan](auto &V1, auto &V2) {
5223 return LessThan[std::get<1>(V1)][std::get<1>(V2)];
5224 });
5225 Begin = Mid;
5226 }
5227
5228 // Collect the final order of sorted indices
5229 for (auto Base : SortedBases)
5230 for (auto &T : Bases[std::get<0>(Base)])
5231 SortedIndices.push_back(std::get<2>(T));
5232
5233 assert(SortedIndices.size() == VL.size() &&
5234 "Expected SortedIndices to be the size of VL");
5235 return true;
5236}
5237
/// Tries to derive a pointer-based ordering for a gather node consisting
/// solely of simple loads. Returns std::nullopt when any scalar is not a
/// simple load or when the pointers cannot be usefully clustered.
5238std::optional<BoUpSLP::OrdersType>
5239BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5240 assert(TE.isGather() && "Expected gather node only.");
5241 Type *ScalarTy = TE.Scalars[0]->getType();
5242
  // Collect the pointer operand of every load; bail out on anything that is
  // not a simple load.
  // NOTE(review): the declaration of Ptrs (original line 5243, likely a
  // SmallVector<Value *>) is missing from this extract.
5244 Ptrs.reserve(TE.Scalars.size());
5245 for (Value *V : TE.Scalars) {
5246 auto *L = dyn_cast<LoadInst>(V);
5247 if (!L || !L->isSimple())
5248 return std::nullopt;
5249 Ptrs.push_back(L->getPointerOperand());
5250 }
5251
  // Delegate the actual clustering/sorting of the pointer operands.
5252 BoUpSLP::OrdersType Order;
5253 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
5254 return std::move(Order);
5255 return std::nullopt;
5256}
5257
5258/// Check if two insertelement instructions are from the same buildvector.
/// Walks both insertelement chains (via \p GetBaseOperand) simultaneously,
/// recording the insertion lanes seen so far in a bit vector; if either chain
/// reaches the other's head without any lane being inserted twice, the two
/// instructions belong to one buildvector sequence.
/// NOTE(review): the function's signature lines (original 5259-5260) are
/// missing from this extract - verify against upstream LLVM.
5261 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5262 // Instructions must be from the same basic blocks.
5263 if (VU->getParent() != V->getParent())
5264 return false;
5265 // Checks if 2 insertelements are from the same buildvector.
5266 if (VU->getType() != V->getType())
5267 return false;
5268 // Multiple used inserts are separate nodes.
5269 if (!VU->hasOneUse() && !V->hasOneUse())
5270 return false;
5271 auto *IE1 = VU;
5272 auto *IE2 = V;
5273 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5274 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5275 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5276 return false;
5277 // Go through the vector operand of insertelement instructions trying to find
5278 // either VU as the original vector for IE2 or V as the original vector for
5279 // IE1.
  // ReusedIdx has one bit per vector lane; a repeated lane while walking the
  // chains (IsReusedIdx) means the chains cannot form a single buildvector.
5280 SmallBitVector ReusedIdx(
5281 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5282 bool IsReusedIdx = false;
5283 do {
5284 if (IE2 == VU && !IE1)
5285 return VU->hasOneUse();
5286 if (IE1 == V && !IE2)
5287 return V->hasOneUse();
  // Advance the IE1 chain: record its lane, then step to its vector operand;
  // stop this chain on a multi-use insert or a reused lane.
5288 if (IE1 && IE1 != V) {
5289 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5290 IsReusedIdx |= ReusedIdx.test(Idx1);
5291 ReusedIdx.set(Idx1);
5292 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5293 IE1 = nullptr;
5294 else
5295 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5296 }
  // Advance the IE2 chain symmetrically.
5297 if (IE2 && IE2 != VU) {
5298 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5299 IsReusedIdx |= ReusedIdx.test(Idx2);
5300 ReusedIdx.set(Idx2);
5301 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5302 IE2 = nullptr;
5303 else
5304 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5305 }
5306 } while (!IsReusedIdx && (IE1 || IE2));
5307 return false;
5308}
5309
/// Computes the preferred element order for tree entry \p TE, if any, for the
/// given reordering direction (\p TopToBottom). Returns std::nullopt when no
/// reordering is needed or possible.
/// NOTE(review): several source lines are missing from this extract (gaps in
/// the embedded numbering, e.g. 5331, 5438, 5459, 5473, 5490, 5528, 5554) -
/// verify the conditions around those points against upstream LLVM.
5310std::optional<BoUpSLP::OrdersType>
5311BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5312 // No need to reorder if need to shuffle reuses, still need to shuffle the
5313 // node.
5314 if (!TE.ReuseShuffleIndices.empty()) {
5315 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5316 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5317 "Reshuffling scalars not yet supported for nodes with padding");
5318
5319 if (isSplat(TE.Scalars))
5320 return std::nullopt;
5321 // Check if reuse shuffle indices can be improved by reordering.
5322 // For this, check that reuse mask is "clustered", i.e. each scalar values
5323 // is used once in each submask of size <number_of_scalars>.
5324 // Example: 4 scalar values.
5325 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5326 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5327 // element 3 is used twice in the second submask.
5328 unsigned Sz = TE.Scalars.size();
5329 if (TE.isGather()) {
  // Combine a found reused-scalars order with the reuse mask and expand it
  // to the full vector factor, submask by submask.
5330 if (std::optional<OrdersType> CurrentOrder =
5332 SmallVector<int> Mask;
5333 fixupOrderingIndices(*CurrentOrder);
5334 inversePermutation(*CurrentOrder, Mask);
5335 ::addMask(Mask, TE.ReuseShuffleIndices);
5336 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5337 unsigned Sz = TE.Scalars.size();
5338 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5339 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5340 if (Idx != PoisonMaskElem)
5341 Res[Idx + K * Sz] = I + K * Sz;
5342 }
5343 return std::move(Res);
5344 }
5345 }
5346 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5347 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5348 2 * TE.getVectorFactor())) == 1)
5349 return std::nullopt;
5350 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5351 Sz)) {
  // Try to build a per-part order: each Sz-wide slice of the combined mask
  // must reference a single distinct value (with limited undefs), otherwise
  // give up.
5352 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5353 if (TE.ReorderIndices.empty())
5354 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5355 else
5356 inversePermutation(TE.ReorderIndices, ReorderMask);
5357 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5358 unsigned VF = ReorderMask.size();
5359 OrdersType ResOrder(VF, VF);
5360 unsigned NumParts = divideCeil(VF, Sz);
5361 SmallBitVector UsedVals(NumParts);
5362 for (unsigned I = 0; I < VF; I += Sz) {
5363 int Val = PoisonMaskElem;
5364 unsigned UndefCnt = 0;
5365 unsigned Limit = std::min(Sz, VF - I);
5366 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5367 [&](int Idx) {
5368 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5369 Val = Idx;
5370 if (Idx == PoisonMaskElem)
5371 ++UndefCnt;
5372 return Idx != PoisonMaskElem && Idx != Val;
5373 }) ||
5374 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5375 UndefCnt > Sz / 2)
5376 return std::nullopt;
5377 UsedVals.set(Val);
5378 for (unsigned K = 0; K < NumParts; ++K) {
5379 unsigned Idx = Val + Sz * K;
5380 if (Idx < VF)
5381 ResOrder[Idx] = I + K;
5382 }
5383 }
5384 return std::move(ResOrder);
5385 }
5386 unsigned VF = TE.getVectorFactor();
5387 // Try build correct order for extractelement instructions.
5388 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5389 TE.ReuseShuffleIndices.end());
5390 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5391 all_of(TE.Scalars, [Sz](Value *V) {
5392 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5393 return Idx && *Idx < Sz;
5394 })) {
  // Rewrite each reuse-mask entry to point at the lane of the scalar's
  // extract index, so the shuffle follows the source vector's layout.
5395 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5396 if (TE.ReorderIndices.empty())
5397 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5398 else
5399 inversePermutation(TE.ReorderIndices, ReorderMask);
5400 for (unsigned I = 0; I < VF; ++I) {
5401 int &Idx = ReusedMask[I];
5402 if (Idx == PoisonMaskElem)
5403 continue;
5404 Value *V = TE.Scalars[ReorderMask[Idx]];
5405 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5406 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5407 }
5408 }
5409 // Build the order of the VF size, need to reorder reuses shuffles, they are
5410 // always of VF size.
5411 OrdersType ResOrder(VF);
5412 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5413 auto *It = ResOrder.begin();
5414 for (unsigned K = 0; K < VF; K += Sz) {
5415 OrdersType CurrentOrder(TE.ReorderIndices);
5416 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5417 if (SubMask.front() == PoisonMaskElem)
5418 std::iota(SubMask.begin(), SubMask.end(), 0);
5419 reorderOrder(CurrentOrder, SubMask);
5420 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5421 std::advance(It, Sz);
5422 }
5423 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5424 return Data.index() == Data.value();
5425 }))
5426 return std::nullopt; // No need to reorder.
5427 return std::move(ResOrder);
5428 }
5429 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5430 any_of(TE.UserTreeIndices,
5431 [](const EdgeInfo &EI) {
5432 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5433 }) &&
5434 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5435 return std::nullopt;
5436 if ((TE.State == TreeEntry::Vectorize ||
5437 TE.State == TreeEntry::StridedVectorize) &&
5439 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
5440 !TE.isAltShuffle())
5441 return TE.ReorderIndices;
5442 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5443 if (!TE.ReorderIndices.empty())
5444 return TE.ReorderIndices;
5445
  // Order PHIs so that ones feeding the same buildvector / the same source
  // vector of extracts end up in element-index order; ties keep the original
  // relative order (stable sort below).
5446 auto PHICompare = [&](unsigned I1, unsigned I2) {
5447 Value *V1 = TE.Scalars[I1];
5448 Value *V2 = TE.Scalars[I2];
5449 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5450 return false;
5451 if (V1->getNumUses() < V2->getNumUses())
5452 return true;
5453 if (V1->getNumUses() > V2->getNumUses())
5454 return false;
5455 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5456 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5457 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
5458 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
5460 IE1, IE2,
5461 [](InsertElementInst *II) { return II->getOperand(0); }))
5462 return I1 < I2;
5463 return getElementIndex(IE1) < getElementIndex(IE2);
5464 }
5465 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
5466 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
5467 if (EE1->getOperand(0) != EE2->getOperand(0))
5468 return I1 < I2;
5469 return getElementIndex(EE1) < getElementIndex(EE2);
5470 }
5471 return I1 < I2;
5472 };
5474 SmallVector<unsigned> Phis(TE.Scalars.size());
5475 std::iota(Phis.begin(), Phis.end(), 0);
5476 OrdersType ResOrder(TE.Scalars.size());
5477 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5478 PhiToId[Id] = Id;
5479 stable_sort(Phis, PHICompare);
5480 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5481 ResOrder[Id] = PhiToId[Phis[Id]];
5482 if (isIdentityOrder(ResOrder))
5483 return std::nullopt; // No need to reorder.
5484 return std::move(ResOrder);
5485 }
5486 if (TE.isGather() && !TE.isAltShuffle() && allSameType(TE.Scalars)) {
5487 // TODO: add analysis of other gather nodes with extractelement
5488 // instructions and other values/instructions, not only undefs.
5489 if ((TE.getOpcode() == Instruction::ExtractElement ||
5491 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5492 all_of(TE.Scalars, [](Value *V) {
5493 auto *EE = dyn_cast<ExtractElementInst>(V);
5494 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5495 })) {
5496 // Check that gather of extractelements can be represented as
5497 // just a shuffle of a single vector.
5498 OrdersType CurrentOrder;
5499 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
5500 /*ResizeAllowed=*/true);
5501 if (Reuse || !CurrentOrder.empty())
5502 return std::move(CurrentOrder);
5503 }
5504 // If the gather node is <undef, v, .., poison> and
5505 // insertelement poison, v, 0 [+ permute]
5506 // is cheaper than
5507 // insertelement poison, v, n - try to reorder.
5508 // If rotating the whole graph, exclude the permute cost, the whole graph
5509 // might be transformed.
5510 int Sz = TE.Scalars.size();
5511 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5512 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5513 const auto *It =
5514 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5515 if (It == TE.Scalars.begin())
5516 return OrdersType();
5517 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5518 if (It != TE.Scalars.end()) {
  // Compare "insert at lane 0 (+ optional permute)" against "insert at
  // lane Idx" via the TTI cost model; reorder only when strictly cheaper.
5519 OrdersType Order(Sz, Sz);
5520 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5521 Order[Idx] = 0;
5522 fixupOrderingIndices(Order);
5523 SmallVector<int> Mask;
5524 inversePermutation(Order, Mask);
5525 InstructionCost PermuteCost =
5526 TopToBottom
5527 ? 0
5529 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5530 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5531 PoisonValue::get(Ty), *It);
5532 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5533 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5534 PoisonValue::get(Ty), *It);
5535 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5536 OrdersType Order(Sz, Sz);
5537 Order[Idx] = 0;
5538 return std::move(Order);
5539 }
5540 }
5541 }
5542 if (isSplat(TE.Scalars))
5543 return std::nullopt;
5544 if (TE.Scalars.size() >= 3)
5545 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5546 return Order;
5547 // Check if can include the order of vectorized loads. For masked gathers do
5548 // extra analysis later, so include such nodes into a special list.
5549 if (TE.isGather() && TE.getOpcode() == Instruction::Load) {
5550 SmallVector<Value *> PointerOps;
5551 OrdersType CurrentOrder;
5552 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5553 CurrentOrder, PointerOps);
5555 return std::move(CurrentOrder);
5556 }
5557 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5558 // has been auditted for correctness with non-power-of-two vectors.
5559 if (!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5560 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5561 return CurrentOrder;
5562 }
5563 return std::nullopt;
5564}
5565
5566/// Checks if the given mask is a "clustered" mask with the same clusters of
5567/// size \p Sz, which are not identity submasks.
/// I.e. every \p Sz-wide submask is element-wise equal to the first one, and
/// the first submask is not an identity mask. Mask.size() is presumably a
/// multiple of \p Sz (callers pass ReuseShuffleIndices) - TODO confirm.
/// NOTE(review): the function's signature line (original 5568) is missing
/// from this extract.
5569 unsigned Sz) {
5570 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5571 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5572 return false;
  // All remaining Sz-wide clusters must repeat the first one exactly.
5573 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5574 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5575 if (Cluster != FirstCluster)
5576 return false;
5577 }
5578 return true;
5579}
5580
/// Applies \p Mask to the reuse indices of \p TE and, for gather nodes whose
/// resulting reuse mask is a repeated non-identity cluster, folds the
/// reordering into the scalars themselves so the reuse mask becomes a set of
/// identity submasks.
/// NOTE(review): one condition line of the `if` below (original 5587) is
/// missing from this extract - verify against upstream LLVM.
5581void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5582 // Reorder reuses mask.
5583 reorderReuses(TE.ReuseShuffleIndices, Mask);
5584 const unsigned Sz = TE.Scalars.size();
5585 // For vectorized and non-clustered reused no need to do anything else.
5586 if (!TE.isGather() ||
5588 Sz) ||
5589 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5590 return;
  // Compose the reorder indices with the (clustered) reuse mask.
5591 SmallVector<int> NewMask;
5592 inversePermutation(TE.ReorderIndices, NewMask);
5593 addMask(NewMask, TE.ReuseShuffleIndices);
5594 // Clear reorder since it is going to be applied to the new mask.
5595 TE.ReorderIndices.clear();
5596 // Try to improve gathered nodes with clustered reuses, if possible.
  // Since all clusters are equal, the first Sz-wide slice of the combined
  // mask describes the whole permutation; apply it to the scalars directly.
5597 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5598 SmallVector<unsigned> NewOrder(Slice);
5599 inversePermutation(NewOrder, NewMask);
5600 reorderScalars(TE.Scalars, NewMask);
5601 // Fill the reuses mask with the identity submasks.
5602 for (auto *It = TE.ReuseShuffleIndices.begin(),
5603 *End = TE.ReuseShuffleIndices.end();
5604 It != End; std::advance(It, Sz))
5605 std::iota(It, std::next(It, Sz), 0);
5606}
5607
5609 ArrayRef<unsigned> SecondaryOrder) {
  // Fills the unset slots (value == Sz) of Order: when no SecondaryOrder is
  // given, with the identity index; otherwise with the corresponding
  // SecondaryOrder entry - in both cases only if that destination index is
  // not already taken by a set slot of Order.
  // NOTE(review): the function's signature line (original 5608) is missing
  // from this extract.
5610 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5611 "Expected same size of orders");
5612 unsigned Sz = Order.size();
  // Record which destination indices the primary order already uses.
5613 SmallBitVector UsedIndices(Sz);
5614 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5615 if (Order[Idx] != Sz)
5616 UsedIndices.set(Order[Idx]);
5617 }
5618 if (SecondaryOrder.empty()) {
5619 for (unsigned Idx : seq<unsigned>(0, Sz))
5620 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5621 Order[Idx] = Idx;
5622 } else {
5623 for (unsigned Idx : seq<unsigned>(0, Sz))
5624 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5625 !UsedIndices.test(SecondaryOrder[Idx]))
5626 Order[Idx] = SecondaryOrder[Idx];
5627 }
5628}
5629
  // Top-to-bottom reordering: collects the preferred orders of all
  // reorderable tree entries per vector factor, picks the most used order for
  // each VF (preferring identity), and applies it to the matching nodes.
  // NOTE(review): the function's opening line (original 5630) and several
  // container declarations (e.g. originals 5632, 5635, 5638, 5645, 5726-5729)
  // are missing from this extract - verify against upstream LLVM.
5631 // Maps VF to the graph nodes.
5633 // ExtractElement gather nodes which can be vectorized and need to handle
5634 // their ordering.
5636
5637 // Phi nodes can have preferred ordering based on their result users
5639
5640 // AltShuffles can also have a preferred ordering that leads to fewer
5641 // instructions, e.g., the addsub instruction in x86.
5642 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5643
5644 // Maps a TreeEntry to the reorder indices of external users.
5646 ExternalUserReorderMap;
5647 // Find all reorderable nodes with the given VF.
5648 // Currently the are vectorized stores,loads,extracts + some gathering of
5649 // extracts.
5650 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5651 const std::unique_ptr<TreeEntry> &TE) {
5652 // Look for external users that will probably be vectorized.
5653 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5654 findExternalStoreUsersReorderIndices(TE.get());
5655 if (!ExternalUserReorderIndices.empty()) {
5656 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5657 ExternalUserReorderMap.try_emplace(TE.get(),
5658 std::move(ExternalUserReorderIndices));
5659 }
5660
5661 // Patterns like [fadd,fsub] can be combined into a single instruction in
5662 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5663 // to take into account their order when looking for the most used order.
5664 if (TE->isAltShuffle()) {
5665 VectorType *VecTy =
5666 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5667 unsigned Opcode0 = TE->getOpcode();
5668 unsigned Opcode1 = TE->getAltOpcode();
5669 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5670 // If this pattern is supported by the target then we consider the order.
5671 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5672 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5673 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5674 }
5675 // TODO: Check the reverse order too.
5676 }
5677
5678 if (std::optional<OrdersType> CurrentOrder =
5679 getReorderingData(*TE, /*TopToBottom=*/true)) {
5680 // Do not include ordering for nodes used in the alt opcode vectorization,
5681 // better to reorder them during bottom-to-top stage. If follow the order
5682 // here, it causes reordering of the whole graph though actually it is
5683 // profitable just to reorder the subgraph that starts from the alternate
5684 // opcode vectorization node. Such nodes already end-up with the shuffle
5685 // instruction and it is just enough to change this shuffle rather than
5686 // rotate the scalars for the whole graph.
5687 unsigned Cnt = 0;
5688 const TreeEntry *UserTE = TE.get();
5689 while (UserTE && Cnt < RecursionMaxDepth) {
5690 if (UserTE->UserTreeIndices.size() != 1)
5691 break;
5692 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5693 return EI.UserTE->State == TreeEntry::Vectorize &&
5694 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5695 }))
5696 return;
5697 UserTE = UserTE->UserTreeIndices.back().UserTE;
5698 ++Cnt;
5699 }
5700 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5701 if (!(TE->State == TreeEntry::Vectorize ||
5702 TE->State == TreeEntry::StridedVectorize) ||
5703 !TE->ReuseShuffleIndices.empty())
5704 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5705 if (TE->State == TreeEntry::Vectorize &&
5706 TE->getOpcode() == Instruction::PHI)
5707 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5708 }
5709 });
5710
5711 // Reorder the graph nodes according to their vectorization factor.
  // Iterate VFs downward: the step subtracts 2 for even VF and 1 for odd VF.
5712 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
5713 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
5714 auto It = VFToOrderedEntries.find(VF);
5715 if (It == VFToOrderedEntries.end())
5716 continue;
5717 // Try to find the most profitable order. We just are looking for the most
5718 // used order and reorder scalar elements in the nodes according to this
5719 // mostly used order.
5720 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5721 // Delete VF entry upon exit.
5722 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
5723
5724 // All operands are reordered and used only in this node - propagate the
5725 // most used order to the user node.
5728 OrdersUses;
5730 for (const TreeEntry *OpTE : OrderedEntries) {
5731 // No need to reorder this nodes, still need to extend and to use shuffle,
5732 // just need to merge reordering shuffle and the reuse shuffle.
5733 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5734 continue;
5735 // Count number of orders uses.
  // Look the entry's order up in the per-kind caches (gathers, alt
  // shuffles, PHIs) before falling back to its own ReorderIndices.
5736 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5737 &PhisToOrders]() -> const OrdersType & {
5738 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5739 auto It = GathersToOrders.find(OpTE);
5740 if (It != GathersToOrders.end())
5741 return It->second;
5742 }
5743 if (OpTE->isAltShuffle()) {
5744 auto It = AltShufflesToOrders.find(OpTE);
5745 if (It != AltShufflesToOrders.end())
5746 return It->second;
5747 }
5748 if (OpTE->State == TreeEntry::Vectorize &&
5749 OpTE->getOpcode() == Instruction::PHI) {
5750 auto It = PhisToOrders.find(OpTE);
5751 if (It != PhisToOrders.end())
5752 return It->second;
5753 }
5754 return OpTE->ReorderIndices;
5755 }();
5756 // First consider the order of the external scalar users.
5757 auto It = ExternalUserReorderMap.find(OpTE);
5758 if (It != ExternalUserReorderMap.end()) {
5759 const auto &ExternalUserReorderIndices = It->second;
5760 // If the OpTE vector factor != number of scalars - use natural order,
5761 // it is an attempt to reorder node with reused scalars but with
5762 // external uses.
5763 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5764 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5765 ExternalUserReorderIndices.size();
5766 } else {
5767 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5768 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5769 }
5770 // No other useful reorder data in this entry.
5771 if (Order.empty())
5772 continue;
5773 }
5774 // Stores actually store the mask, not the order, need to invert.
5775 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5776 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5777 SmallVector<int> Mask;
5778 inversePermutation(Order, Mask);
5779 unsigned E = Order.size();
5780 OrdersType CurrentOrder(E, E);
5781 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5782 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5783 });
5784 fixupOrderingIndices(CurrentOrder);
5785 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5786 } else {
5787 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5788 }
5789 }
5790 if (OrdersUses.empty())
5791 continue;
5792 // Choose the most used order.
  // Empty orders count toward the identity order; FilledIdentityCnt tracks
  // only the explicitly-filled identity orders for the tie-break below.
5793 unsigned IdentityCnt = 0;
5794 unsigned FilledIdentityCnt = 0;
5795 OrdersType IdentityOrder(VF, VF);
5796 for (auto &Pair : OrdersUses) {
5797 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
5798 if (!Pair.first.empty())
5799 FilledIdentityCnt += Pair.second;
5800 IdentityCnt += Pair.second;
5801 combineOrders(IdentityOrder, Pair.first);
5802 }
5803 }
5804 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5805 unsigned Cnt = IdentityCnt;
5806 for (auto &Pair : OrdersUses) {
5807 // Prefer identity order. But, if filled identity found (non-empty order)
5808 // with same number of uses, as the new candidate order, we can choose
5809 // this candidate order.
5810 if (Cnt < Pair.second ||
5811 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5812 Cnt == Pair.second && !BestOrder.empty() &&
5813 isIdentityOrder(BestOrder))) {
5814 combineOrders(Pair.first, BestOrder);
5815 BestOrder = Pair.first;
5816 Cnt = Pair.second;
5817 } else {
5818 combineOrders(BestOrder, Pair.first);
5819 }
5820 }
5821 // Set order of the user node.
5822 if (isIdentityOrder(BestOrder))
5823 continue;
5824 fixupOrderingIndices(BestOrder);
5825 SmallVector<int> Mask;
5826 inversePermutation(BestOrder, Mask);
5827 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5828 unsigned E = BestOrder.size();
5829 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5830 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5831 });
5832 // Do an actual reordering, if profitable.
5833 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5834 // Just do the reordering for the nodes with the given VF.
5835 if (TE->Scalars.size() != VF) {
5836 if (TE->ReuseShuffleIndices.size() == VF) {
5837 // Need to reorder the reuses masks of the operands with smaller VF to
5838 // be able to find the match between the graph nodes and scalar
5839 // operands of the given node during vectorization/cost estimation.
5840 assert(all_of(TE->UserTreeIndices,
5841 [VF, &TE](const EdgeInfo &EI) {
5842 return EI.UserTE->Scalars.size() == VF ||
5843 EI.UserTE->Scalars.size() ==
5844 TE->Scalars.size();
5845 }) &&
5846 "All users must be of VF size.");
5847 // Update ordering of the operands with the smaller VF than the given
5848 // one.
5849 reorderNodeWithReuses(*TE, Mask);
5850 }
5851 continue;
5852 }
5853 if ((TE->State == TreeEntry::Vectorize ||
5854 TE->State == TreeEntry::StridedVectorize) &&
5856 InsertElementInst>(TE->getMainOp()) &&
5857 !TE->isAltShuffle()) {
5858 // Build correct orders for extract{element,value}, loads and
5859 // stores.
5860 reorderOrder(TE->ReorderIndices, Mask);
5861 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5862 TE->reorderOperands(Mask);
5863 } else {
5864 // Reorder the node and its operands.
5865 TE->reorderOperands(Mask);
5866 assert(TE->ReorderIndices.empty() &&
5867 "Expected empty reorder sequence.");
5868 reorderScalars(TE->Scalars, Mask);
5869 }
5870 if (!TE->ReuseShuffleIndices.empty()) {
5871 // Apply reversed order to keep the original ordering of the reused
5872 // elements to avoid extra reorder indices shuffling.
5873 OrdersType CurrentOrder;
5874 reorderOrder(CurrentOrder, MaskOrder);
5875 SmallVector<int> NewReuses;
5876 inversePermutation(CurrentOrder, NewReuses);
5877 addMask(NewReuses, TE->ReuseShuffleIndices);
5878 TE->ReuseShuffleIndices.swap(NewReuses);
5879 }
5880 }
5881 }
5882}
5883
/// Checks whether all operands of \p UserTE may be reordered together.
/// On success, records vectorized operand nodes in \p Edges and reorderable
/// gather operands in \p GatherOps. Returns false when an operand is shared
/// with another user node or is gathered by more than one reorderable entry
/// (unless that operand is all-constant).
5884bool BoUpSLP::canReorderOperands(
5885 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5886 ArrayRef<TreeEntry *> ReorderableGathers,
5887 SmallVectorImpl<TreeEntry *> &GatherOps) {
5888 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
  // Skip operands already recorded in Edges as vectorized.
5889 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5890 return OpData.first == I &&
5891 (OpData.second->State == TreeEntry::Vectorize ||
5892 OpData.second->State == TreeEntry::StridedVectorize);
5893 }))
5894 continue;
5895 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5896 // Do not reorder if operand node is used by many user nodes.
5897 if (any_of(TE->UserTreeIndices,
5898 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5899 return false;
5900 // Add the node to the list of the ordered nodes with the identity
5901 // order.
5902 Edges.emplace_back(I, TE);
5903 // Add ScatterVectorize nodes to the list of operands, where just
5904 // reordering of the scalars is required. Similar to the gathers, so
5905 // simply add to the list of gathered ops.
5906 // If there are reused scalars, process this node as a regular vectorize
5907 // node, just reorder reuses mask.
5908 if (TE->State != TreeEntry::Vectorize &&
5909 TE->State != TreeEntry::StridedVectorize &&
5910 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5911 GatherOps.push_back(TE);
5912 continue;
5913 }
  // Not a vectorized operand: count how many reorderable gather entries
  // gather exactly this operand. The lambda also captures the last match in
  // Gather as a side effect; more than one match for a non-all-constant
  // operand means reordering is not possible.
5914 TreeEntry *Gather = nullptr;
5915 if (count_if(ReorderableGathers,
5916 [&Gather, UserTE, I](TreeEntry *TE) {
5917 assert(TE->State != TreeEntry::Vectorize &&
5918 TE->State != TreeEntry::StridedVectorize &&
5919 "Only non-vectorized nodes are expected.");
5920 if (any_of(TE->UserTreeIndices,
5921 [UserTE, I](const EdgeInfo &EI) {
5922 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5923 })) {
5924 assert(TE->isSame(UserTE->getOperand(I)) &&
5925 "Operand entry does not match operands.");
5926 Gather = TE;
5927 return true;
5928 }
5929 return false;
5930 }) > 1 &&
5931 !allConstant(UserTE->getOperand(I)))
5932 return false;
5933 if (Gather)
5934 GatherOps.push_back(Gather);
5935 }
5936 return true;
5937}
5938
// Reorders the SLP graph bottom-to-top: collects all reorderable leaf nodes,
// then repeatedly propagates the most-used operand order up to the user
// nodes, reordering the users themselves where profitable. If IgnoreReorder
// is set, the root node's order may be dropped at the end.
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  SetVector<TreeEntry *> OrderedEntries;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
  SmallVector<TreeEntry *> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize)
      NonVectorized.push_back(TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(*TE, /*TopToBottom=*/false)) {
      OrderedEntries.insert(TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands, that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  // NOTE(review): the declaration of the Visited set used below appears to
  // be dropped from this excerpt — confirm against the upstream source.
  while (!OrderedEntries.empty()) {
    // 1. Filter out only reordered nodes.
    // 2. If the entry has multiple uses - skip it and jump to the next node.
    // NOTE(review): the declaration of the Users map (user entry ->
    // (operand index, operand entry) list) appears to be dropped here.
    SmallVector<TreeEntry *> Filtered;
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            (TE->isGather() && GathersToOrders.contains(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices),
                  [TE](const EdgeInfo &EI) {
                    return EI.UserTE == TE->UserTreeIndices.front().UserTE;
                  }) ||
          !Visited.insert(TE).second) {
        Filtered.push_back(TE);
        continue;
      }
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      for (EdgeInfo &EI : TE->UserTreeIndices)
        Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
    }
    // Erase filtered entries.
    for (TreeEntry *TE : Filtered)
      OrderedEntries.remove(TE);
    // NOTE(review): the opening line of this declaration (SmallVector<)
    // appears to be dropped from this excerpt.
        std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
        UsersVec(Users.begin(), Users.end());
    // Process users from the bottom of the graph upwards (larger Idx first).
    sort(UsersVec, [](const auto &Data1, const auto &Data2) {
      return Data1.first->Idx > Data2.first->Idx;
    });
    for (auto &Data : UsersVec) {
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      if (!canReorderOperands(Data.first, Data.second, NonVectorized,
                              GatherOps)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      // NOTE(review): the declaration of OrdersUses (order -> use-count map)
      // appears to be dropped from this excerpt.
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might be not
      // profitable.
      // NOTE(review): the declarations of the VisitedOps/VisitedUsers sets
      // appear to be dropped from this excerpt.
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(*OpTE, /*TopToBottom=*/false)
                .value_or(OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;
        // Weight each order by the number of operand slots using this entry.
        unsigned NumOps = count_if(
            Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          SmallVector<int> Mask;
          inversePermutation(Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Mask, CurrentOrder.begin(), [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
          OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
              NumOps;
        } else {
          OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
        }
        // Res tracks the count for the empty (natural) order.
        auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(TE))
              return !getReorderingData(*TE, /*TopToBottom=*/false)
                          .value_or(OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
          TreeEntry *UserTE = EI.UserTE;
          if (!VisitedUsers.insert(UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          // NOTE(review): the declaration of Ops (the user's operand list)
          // appears to be dropped from this excerpt.
          if (static_cast<unsigned>(count_if(
                  Ops, [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           all_of(Op.second->UserTreeIndices,
                                  [UserTE](const EdgeInfo &EI) {
                                    return EI.UserTE == UserTE;
                                  });
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(IdentityOrder, Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if filled identity found (non-empty
        // order) with same number of uses, as the new candidate order, we can
        // choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Pair.first, BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(BestOrder, Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(BestOrder)) {
        for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
          OrderedEntries.remove(Op.second);
        continue;
      }
      fixupOrderingIndices(BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        OrderedEntries.remove(TE);
        if (!VisitedOps.insert(TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(*TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      // NOTE(review): part of this condition (likely an isa<> check on the
      // main op) appears to be dropped from this excerpt.
      if (Data.first->State != TreeEntry::Vectorize ||
              Data.first->getMainOp()) ||
          Data.first->isAltShuffle())
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
          Data.first->isAltShuffle() ||
          Data.first->State == TreeEntry::StridedVectorize) {
        reorderScalars(Data.first->Scalars, Mask);
        reorderOrder(Data.first->ReorderIndices, MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !Data.first->isAltShuffle()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          OrderedEntries.insert(Data.first);
        }
      } else {
        reorderOrder(Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
6203
6204Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6205 if ((Entry.getOpcode() == Instruction::Store ||
6206 Entry.getOpcode() == Instruction::Load) &&
6207 Entry.State == TreeEntry::StridedVectorize &&
6208 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6209 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6210 return dyn_cast<Instruction>(Entry.Scalars.front());
6211}
6212
// Collects all scalars of vectorized tree entries that are used outside the
// tree (or listed in ExternallyUsedValues) into ExternalUses, so extracts
// can be generated for them later.
// NOTE(review): the first line of this definition's signature (return type
// and name, presumably void BoUpSLP::buildExternalUses() is missing from
// this excerpt.
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Maps an already-recorded scalar to its index in ExternalUses, so each
  // scalar is tracked at most once (a null User means "extract for all").
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather())
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      if (!isa<Instruction>(Scalar))
        continue;
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Check if the scalar is externally used as an extra arg.
      const auto *ExtI = ExternallyUsedValues.find(Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
        // Null user: extraction is required regardless of specific users.
        ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst || isDeleted(UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          // NOTE(review): part of this condition appears to be dropped from
          // this excerpt (likely a !doesInTreeUserNeedToExtract(...) call).
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
                  Scalar, getRootEntryInstruction(*UseEntry), TLI)) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(!UseEntry->isGather() && "Bad state");
            continue;
          }
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        // Too many uses: stop tracking individual users and request
        // extraction for all uses via a null user.
        if (U && Scalar->hasNUsesOrMore(UsesLimit))
          U = nullptr;
        int FoundLane = Entry->findLaneForValue(Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
        ExternalUses.emplace_back(Scalar, U, FoundLane);
        if (!U)
          break;
      }
    }
  }
}
6289
// Collects, per underlying pointer object, the stores that use the scalars
// of the given tree entry (at most one store per pointer per lane), as
// candidates for store vectorization.
// NOTE(review): the return-type line of this definition and the declaration
// of the PtrToStoresMap local are missing from this excerpt.
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (isa<ConstantData>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already part of the vectorizable tree.
      if (getTreeEntry(U))
        continue;

      Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
      auto &StoresVec = PtrToStoresMap[Ptr];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (StoresVec.size() > Lane)
        continue;
      // Skip if in different BBs.
      if (!StoresVec.empty() &&
          SI->getParent() != StoresVec.back()->getParent())
        continue;
      // Make sure that the stores are of the same type.
      if (!StoresVec.empty() &&
          SI->getValueOperand()->getType() !=
              StoresVec.back()->getValueOperand()->getType())
        continue;
      StoresVec.push_back(SI);
    }
  }
  return PtrToStoresMap;
}
6334
6335bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6336 OrdersType &ReorderIndices) const {
6337 // We check whether the stores in StoreVec can form a vector by sorting them
6338 // and checking whether they are consecutive.
6339
6340 // To avoid calling getPointersDiff() while sorting we create a vector of
6341 // pairs {store, offset from first} and sort this instead.
6342 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
6343 StoreInst *S0 = StoresVec[0];
6344 StoreOffsetVec[0] = {S0, 0};
6345 Type *S0Ty = S0->getValueOperand()->getType();
6346 Value *S0Ptr = S0->getPointerOperand();
6347 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6348 StoreInst *SI = StoresVec[Idx];
6349 std::optional<int> Diff =
6350 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6351 SI->getPointerOperand(), *DL, *SE,
6352 /*StrictCheck=*/true);
6353 // We failed to compare the pointers so just abandon this StoresVec.
6354 if (!Diff)
6355 return false;
6356 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
6357 }
6358
6359 // Sort the vector based on the pointers. We create a copy because we may
6360 // need the original later for calculating the reorder (shuffle) indices.
6361 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
6362 const std::pair<StoreInst *, int> &Pair2) {
6363 int Offset1 = Pair1.second;
6364 int Offset2 = Pair2.second;
6365 return Offset1 < Offset2;
6366 });
6367
6368 // Check if the stores are consecutive by checking if their difference is 1.
6369 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
6370 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
6371 return false;
6372
6373 // Calculate the shuffle indices according to their offset against the sorted
6374 // StoreOffsetVec.
6375 ReorderIndices.reserve(StoresVec.size());
6376 for (StoreInst *SI : StoresVec) {
6377 unsigned Idx = find_if(StoreOffsetVec,
6378 [SI](const std::pair<StoreInst *, int> &Pair) {
6379 return Pair.first == SI;
6380 }) -
6381 StoreOffsetVec.begin();
6382 ReorderIndices.push_back(Idx);
6383 }
6384 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6385 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6386 // same convention here.
6387 if (isIdentityOrder(ReorderIndices))
6388 ReorderIndices.clear();
6389
6390 return true;
6391}
6392
#ifndef NDEBUG
// Debug-only helper: prints an ordering as a comma-separated index list.
// NOTE(review): the signature line of this dump function is missing from
// this excerpt.
  for (unsigned Idx : Order)
    dbgs() << Idx << ", ";
  dbgs() << "\n";
}
#endif
6400
// For a tree entry, finds groups of external user stores (one full group per
// underlying pointer) that could be vectorized, and returns the reorder
// index vector for each viable group.
// NOTE(review): the return-type line of this definition is missing from this
// excerpt.
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();

  // NOTE(review): the declaration of PtrToStoresMap (initialized from the
  // call below) is partially missing from this excerpt.
      collectUserStores(TE);

  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  SmallVector<OrdersType, 1> ExternalReorderIndices;

  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`
  for (const auto &Pair : PtrToStoresMap) {
    auto &StoresVec = Pair.second;
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
      continue;

    // If the stores are not consecutive then abandon this StoresVec.
    OrdersType ReorderIndices;
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;

    // We now know that the scalars in StoresVec can form a vector
    // instruction, so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  }
  return ExternalReorderIndices;
}
6432
// Builds the vectorizable tree from the given roots, remembering the set of
// values the caller wants excluded from extraction.
// NOTE(review): the first line of this definition's signature (presumably
// void BoUpSLP::buildTree(ArrayRef<Value *> Roots,) is missing from this
// excerpt.
    const SmallDenseSet<Value *> &UserIgnoreLst) {
  // Start from a clean state before building a new tree.
  deleteTree();
  UserIgnoreList = &UserIgnoreLst;
  // All roots must share a type to form a vector.
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6441
// Builds the vectorizable tree from the given roots (no user-ignore list).
// NOTE(review): the signature line of this buildTree overload is missing
// from this excerpt.
  deleteTree();
  // All roots must share a type to form a vector.
  if (!allSameType(Roots))
    return;
  buildTree_rec(Roots, 0, EdgeInfo());
}
6448
/// Tries to find subvector of loads and builds new vector of only loads if
/// can be profitable. Clusters the simple, not-yet-vectorized loads in \p VL
/// by base pointer and distance, then merges each cluster into the existing
/// \p GatheredLoads groups (or appends new groups, controlled by \p AddNew).
// NOTE(review): the first line of this definition's signature (presumably
// static void gatherPossiblyVectorizableLoads() and the ScalarEvolution /
// TargetTransformInfo parameter lines are missing from this excerpt.
    const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
    SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
    bool AddNew = true) {
  if (VL.empty())
    return;
  Type *ScalarTy = getValueType(VL.front());
  if (!isValidElementType(ScalarTy))
    return;
  const int NumScalars = VL.size();
  int NumParts = 1;
  if (NumScalars > 1) {
    auto *VecTy = getWidenedType(ScalarTy, NumScalars);
    NumParts = TTI.getNumberOfParts(VecTy);
    if (NumParts == 0 || NumParts >= NumScalars ||
        VecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(),
                                  VecTy->getNumElements() / NumParts))
      NumParts = 1;
  }
  unsigned VF = PowerOf2Ceil(NumScalars / NumParts);
  // NOTE(review): the declaration of ClusteredLoads is missing from this
  // excerpt.
  for (int I : seq<int>(NumParts)) {
    for (Value *V :
         VL.slice(I * VF, std::min<unsigned>(VF, VL.size() - I * VF))) {
      auto *LI = dyn_cast<LoadInst>(V);
      if (!LI)
        continue;
      // Skip deleted, already-vectorized and non-simple loads.
      if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
        continue;
      // Try to place the load into an existing cluster at a unique distance.
      bool IsFound = false;
      for (auto &Data : ClusteredLoads) {
        if (LI->getParent() != Data.front().first->getParent())
          continue;
        std::optional<int> Dist =
            getPointersDiff(LI->getType(), LI->getPointerOperand(),
                            Data.front().first->getType(),
                            Data.front().first->getPointerOperand(), DL, SE,
                            /*StrictCheck=*/true);
        if (Dist && all_of(Data, [&](const std::pair<LoadInst *, int> &Pair) {
              IsFound |= Pair.first == LI;
              return IsFound || Pair.second != *Dist;
            })) {
          if (!IsFound)
            Data.emplace_back(LI, *Dist);
          IsFound = true;
          break;
        }
      }
      if (!IsFound)
        ClusteredLoads.emplace_back().emplace_back(LI, 0);
    }
  }
  // Finds an existing gathered-loads group compatible with Loads; on success
  // reports the relative Offset, the indices of unique loads to add (ToAdd)
  // and of repeated ones (Repeated), and advances Start for resumed search.
  // NOTE(review): parts of this lambda's parameter list are missing from
  // this excerpt.
  auto FindMatchingLoads =
          &GatheredLoads,
          SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
          int &Offset, unsigned &Start) {
        if (Loads.empty())
          return GatheredLoads.end();
        LoadInst *LI = Loads.front().first;
        for (auto [Idx, Data] : enumerate(GatheredLoads)) {
          if (Idx < Start)
            continue;
          ToAdd.clear();
          if (LI->getParent() != Data.front().first->getParent())
            continue;
          // NOTE(review): the first line of this getPointersDiff call is
          // missing from this excerpt.
          std::optional<int> Dist =
                              Data.front().first->getType(),
                              Data.front().first->getPointerOperand(), DL, SE,
                              /*StrictCheck=*/true);
          if (Dist) {
            // Found matching gathered loads - check if all loads are unique
            // or can be effectively vectorized.
            unsigned NumUniques = 0;
            for (auto [Cnt, Pair] : enumerate(Loads)) {
              bool Used = any_of(
                  Data, [&, &P = Pair](const std::pair<LoadInst *, int> &PD) {
                    return PD.first == P.first;
                  });
              if (!Used &&
                  none_of(Data,
                          [&, &P = Pair](const std::pair<LoadInst *, int> &PD) {
                            return *Dist + P.second == PD.second;
                          })) {
                ++NumUniques;
                ToAdd.insert(Cnt);
              } else if (Used) {
                Repeated.insert(Cnt);
              }
            }
            // Accept the group if the merge keeps enough unique loads and
            // does not push the group size past a profitable power of 2.
            if (NumUniques > 0 &&
                (Loads.size() == NumUniques ||
                 (Loads.size() - NumUniques >= 2 &&
                  Loads.size() - NumUniques >= Loads.size() / 2 &&
                  (has_single_bit(Data.size() + NumUniques) ||
                   bit_ceil(Data.size()) <
                       bit_ceil(Data.size() + NumUniques))))) {
              Offset = *Dist;
              Start = Idx + 1;
              return std::next(GatheredLoads.begin(), Idx);
            }
          }
        }
        ToAdd.clear();
        return GatheredLoads.end();
      };
  for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
    unsigned Start = 0;
    SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
    int Offset = 0;
    auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
                                 Offset, Start);
    // Merge into every matching group, rebasing distances by Offset.
    while (It != GatheredLoads.end()) {
      assert(!LocalToAdd.empty() && "Expected some elements to add.");
      for (unsigned Idx : LocalToAdd)
        It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
      ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
      It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
                             Start);
    }
    // Any loads left neither added nor repeated need a (new) group.
    if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
          return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
        })) {
      // Appends the still-unplaced loads of Data to the given group.
      // NOTE(review): the parameter list of this lambda is missing from this
      // excerpt.
      auto AddNewLoads =
            for (unsigned Idx : seq<unsigned>(Data.size())) {
              if (ToAdd.contains(Idx) || Repeated.contains(Idx))
                continue;
              Loads.push_back(Data[Idx]);
            }
          };
      if (!AddNew) {
        LoadInst *LI = Data.front().first;
        // Append leftovers to every existing group with the same parent
        // block and load type instead of creating a new one.
        It = find_if(
            GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
              return PD.front().first->getParent() == LI->getParent() &&
                     PD.front().first->getType() == LI->getType();
            });
        while (It != GatheredLoads.end()) {
          AddNewLoads(*It);
          It = std::find_if(
              std::next(It), GatheredLoads.end(),
              [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
                return PD.front().first->getParent() == LI->getParent() &&
                       PD.front().first->getType() == LI->getType();
              });
        }
      // NOTE(review): a line closing the !AddNew branch (and likely a
      // `continue;`) appears to be dropped from this excerpt around here.
      GatheredLoads.emplace_back().append(Data.begin(), Data.end());
      AddNewLoads(GatheredLoads.emplace_back());
    }
  }
}
6609
6610void BoUpSLP::tryToVectorizeGatheredLoads(
6611 ArrayRef<SmallVector<std::pair<LoadInst *, int>>> GatheredLoads) {
6612 GatheredLoadsEntriesFirst = VectorizableTree.size();
6613
6614 // Sort loads by distance.
6615 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6616 const std::pair<LoadInst *, int> &L2) {
6617 return L1.second > L2.second;
6618 };
6619
6620 auto IsMaskedGatherSupported = [&](ArrayRef<LoadInst *> Loads) {
6621 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6622 Loads.size());
6623 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6624 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6625 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6626 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6627 };
6628
6629 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6630 BoUpSLP::ValueSet &VectorizedLoads,
6631 SmallVectorImpl<LoadInst *> &NonVectorized,
6632 bool Final, unsigned MaxVF) {
6634 unsigned StartIdx = 0;
6635 SmallVector<int> CandidateVFs;
6636 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6637 CandidateVFs.push_back(MaxVF);
6638 for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) {
6639 CandidateVFs.push_back(NumElts);
6640 if (VectorizeNonPowerOf2 && NumElts > 2)
6641 CandidateVFs.push_back(NumElts - 1);
6642 }
6643
6644 if (Final && CandidateVFs.empty())
6645 return Results;
6646
6647 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6648 for (unsigned NumElts : CandidateVFs) {
6649 if (Final && NumElts > BestVF)
6650 continue;
6651 SmallVector<unsigned> MaskedGatherVectorized;
6652 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E;
6653 ++Cnt) {
6654 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(Cnt, NumElts);
6655 if (VectorizedLoads.count(Slice.front()) ||
6656 VectorizedLoads.count(Slice.back()) ||
6658 continue;
6659 // Check if it is profitable to try vectorizing gathered loads. It is
6660 // profitable if we have more than 3 consecutive loads or if we have
6661 // less but all users are vectorized or deleted.
6662 bool AllowToVectorize =
6663 NumElts >= 3 ||
6664 any_of(ValueToGatherNodes.at(Slice.front()),
6665 [=](const TreeEntry *TE) {
6666 return TE->Scalars.size() == 2 &&
6667 ((TE->Scalars.front() == Slice.front() &&
6668 TE->Scalars.back() == Slice.back()) ||
6669 (TE->Scalars.front() == Slice.back() &&
6670 TE->Scalars.back() == Slice.front()));
6671 });
6672 // Check if it is profitable to vectorize 2-elements loads.
6673 if (NumElts == 2) {
6674 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6675 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6676 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6677 for (LoadInst *LI : Slice) {
6678 // If single use/user - allow to vectorize.
6679 if (LI->hasOneUse())
6680 continue;
6681 // 1. Check if number of uses equals number of users.
6682 // 2. All users are deleted.
6683 // 3. The load broadcasts are not allowed or the load is not
6684 // broadcasted.
6685 if (std::distance(LI->user_begin(), LI->user_end()) !=
6686 LI->getNumUses())
6687 return false;
6688 if (!IsLegalBroadcastLoad)
6689 continue;
6690 if (LI->hasNUsesOrMore(UsesLimit))
6691 return false;
6692 for (User *U : LI->users()) {
6693 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
6694 continue;
6695 if (const TreeEntry *UTE = getTreeEntry(U)) {
6696 for (int I : seq<int>(UTE->getNumOperands())) {
6697 if (all_of(UTE->getOperand(I),
6698 [LI](Value *V) { return V == LI; }))
6699 // Found legal broadcast - do not vectorize.
6700 return false;
6701 }
6702 }
6703 }
6704 }
6705 return true;
6706 };
6707 AllowToVectorize = CheckIfAllowed(Slice);
6708 }
6709 if (AllowToVectorize) {
6710 SmallVector<Value *> PointerOps;
6711 OrdersType CurrentOrder;
6712 // Try to build vector load.
6713 ArrayRef<Value *> Values(
6714 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
6715 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
6716 PointerOps, &BestVF);
6717 if (LS != LoadsState::Gather ||
6718 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
6719 if (LS == LoadsState::ScatterVectorize) {
6720 if (MaskedGatherVectorized.empty() ||
6721 Cnt >= MaskedGatherVectorized.back() + NumElts)
6722 MaskedGatherVectorized.push_back(Cnt);
6723 continue;
6724 }
6725 if (LS != LoadsState::Gather) {
6726 Results.emplace_back(Values, LS);
6727 VectorizedLoads.insert(Slice.begin(), Slice.end());
6728 // If we vectorized initial block, no need to try to vectorize it
6729 // again.
6730 if (Cnt == StartIdx)
6731 StartIdx += NumElts;
6732 }
6733 // Check if the whole array was vectorized already - exit.
6734 if (StartIdx >= Loads.size())
6735 break;
6736 // Erase last masked gather candidate, if another candidate within
6737 // the range is found to be better.
6738 if (!MaskedGatherVectorized.empty() &&
6739 Cnt < MaskedGatherVectorized.back() + NumElts)
6740 MaskedGatherVectorized.pop_back();
6741 Cnt += NumElts - 1;
6742 continue;
6743 }
6744 }
6745 if (!AllowToVectorize || BestVF == 0)
6747 }
6748 // Mark masked gathers candidates as vectorized, if any.
6749 for (unsigned Cnt : MaskedGatherVectorized) {
6750 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(Cnt, NumElts);
6751 ArrayRef<Value *> Values(
6752 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
6753 Results.emplace_back(Values, LoadsState::ScatterVectorize);
6754 VectorizedLoads.insert(Slice.begin(), Slice.end());
6755 // If we vectorized initial block, no need to try to vectorize it again.
6756 if (Cnt == StartIdx)
6757 StartIdx += NumElts;
6758 }
6759 }
6760 for (LoadInst *LI : Loads) {
6761 if (!VectorizedLoads.contains(LI))
6762 NonVectorized.push_back(LI);
6763 }
6764 return Results;
6765 };
6766 auto ProcessGatheredLoads =
6768 bool Final = false) {
6769 SmallVector<LoadInst *> NonVectorized;
6770 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
6771 if (LoadsDists.size() <= 1) {
6772 NonVectorized.push_back(LoadsDists.back().first);
6773 continue;
6774 }
6775 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
6776 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
6777 transform(
6778 LoadsDists, OriginalLoads.begin(),
6779 [](const std::pair<LoadInst *, int> &L) { return L.first; });
6780 stable_sort(LocalLoadsDists, LoadSorter);
6782 unsigned MaxConsecutiveDistance = 0;
6783 unsigned CurrentConsecutiveDist = 1;
6784 int LastDist = LocalLoadsDists.front().second;
6785 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
6786 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
6787 if (getTreeEntry(L.first))
6788 continue;
6789 assert(LastDist >= L.second &&
6790 "Expected first distance always not less than second");
6791 if (static_cast<unsigned>(LastDist - L.second) ==
6792 CurrentConsecutiveDist) {
6793 ++CurrentConsecutiveDist;
6794 MaxConsecutiveDistance =
6795 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
6796 Loads.push_back(L.first);
6797 continue;
6798 }
6799 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
6800 !Loads.empty())
6801 Loads.pop_back();
6802 CurrentConsecutiveDist = 1;
6803 LastDist = L.second;
6804 Loads.push_back(L.first);
6805 }
6806 if (Loads.size() <= 1)
6807 continue;
6808 if (AllowMaskedGather)
6809 MaxConsecutiveDistance = Loads.size();
6810 else if (MaxConsecutiveDistance < 2)
6811 continue;
6812 BoUpSLP::ValueSet VectorizedLoads;
6813 SmallVector<LoadInst *> SortedNonVectorized;
6815 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
6816 Final, MaxConsecutiveDistance);
6817 if (!Results.empty() && !SortedNonVectorized.empty() &&
6818 OriginalLoads.size() == Loads.size() &&
6819 MaxConsecutiveDistance == Loads.size() &&
6821 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
6822 return P.second == LoadsState::ScatterVectorize;
6823 })) {
6824 VectorizedLoads.clear();
6825 SmallVector<LoadInst *> UnsortedNonVectorized;
6827 UnsortedResults =
6828 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
6829 UnsortedNonVectorized, Final,
6830 OriginalLoads.size());
6831 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
6832 SortedNonVectorized.swap(UnsortedNonVectorized);
6833 Results.swap(UnsortedResults);
6834 }
6835 }
6836 for (auto [Slice, _] : Results) {
6837 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
6838 << Slice.size() << ")\n");
6839 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
6840 for (Value *L : Slice)
6841 if (!getTreeEntry(L))
6842 SortedNonVectorized.push_back(cast<LoadInst>(L));
6843 continue;
6844 }
6845
6846 // Select maximum VF as a maximum of user gathered nodes and
6847 // distance between scalar loads in these nodes.
6848 unsigned MaxVF = Slice.size();
6849 unsigned UserMaxVF = 0;
6850 if (MaxVF == 2) {
6851 UserMaxVF = MaxVF;
6852 } else {
6853 std::optional<unsigned> CommonVF = 0;
6855 for (auto [Idx, V] : enumerate(Slice)) {
6856 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
6857 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
6858 unsigned Pos =
6859 EntryToPosition.try_emplace(E, Idx).first->second;
6860 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
6861 if (CommonVF) {
6862 if (*CommonVF == 0) {
6863 CommonVF = E->Scalars.size();
6864 continue;
6865 }
6866 if (*CommonVF != E->Scalars.size())
6867 CommonVF.reset();
6868 }
6869 }
6870 }
6871 // Try to build long masked gather loads.
6872 UserMaxVF = bit_ceil(UserMaxVF);
6873 }
6874 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
6875 bool IsVectorized = true;
6876 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
6877 ArrayRef<Value *> SubSlice =
6878 Slice.slice(I, std::min(VF, E - I));
6879 if (getTreeEntry(SubSlice.front()))
6880 continue;
6881 unsigned Sz = VectorizableTree.size();
6882 buildTree_rec(SubSlice, 0, EdgeInfo());
6883 if (Sz == VectorizableTree.size()) {
6884 IsVectorized = false;
6885 continue;
6886 }
6887 }
6888 if (IsVectorized)
6889 break;
6890 }
6891 }
6892 NonVectorized.append(SortedNonVectorized);
6893 }
6894 return NonVectorized;
6895 };
6896 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(GatheredLoads);
6897 if (!GatheredLoads.empty() && !NonVectorized.empty() &&
6898 std::accumulate(
6899 GatheredLoads.begin(), GatheredLoads.end(), 0u,
6900 [](unsigned S, ArrayRef<std::pair<LoadInst *, int>> LoadsDists) {
6901 return S + LoadsDists.size();
6902 }) != NonVectorized.size() &&
6903 IsMaskedGatherSupported(NonVectorized)) {
6905 for (LoadInst *LI : NonVectorized) {
6906 // Reinsert non-vectorized loads to other list of loads with the same
6907 // base pointers.
6908 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
6909 FinalGatheredLoads,
6910 /*AddNew=*/false);
6911 }
6912 // Final attempt to vectorize non-vectorized loads.
6913 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
6914 }
6915 // If no new entries created, consider it as no gathered loads entries must be
6916 // handled.
6917 if (static_cast<unsigned>(GatheredLoadsEntriesFirst) ==
6918 VectorizableTree.size())
6919 GatheredLoadsEntriesFirst = NoGatheredLoads;
6920}
6921
6922/// \return true if the specified list of values has only one instruction that
6923/// requires scheduling, false otherwise.
// NOTE(review): the function signature (original line 6925) and the loop's
// guard condition (original line 6928 -- presumably a
// doesNotNeedToBeScheduled(V) check) are missing from this excerpt; confirm
// against the upstream file before editing.
6924#ifndef NDEBUG
// Tracks the single value found so far that requires scheduling; stays null
// until the first such value is seen.
6926 Value *NeedsScheduling = nullptr;
6927 for (Value *V : VL) {
6929 continue;
6930 if (!NeedsScheduling) {
// First scheduling-relevant value -- remember it and keep scanning.
6931 NeedsScheduling = V;
6932 continue;
6933 }
// A second scheduling-relevant value was found -> not "single".
6934 return false;
6935 }
// Pointer-to-bool conversion: non-null means exactly one value needed
// scheduling.
6936 return NeedsScheduling;
6937}
6938#endif
6939
6940/// Generates key/subkey pair for the given value to provide effective sorting
6941/// of the values and better detection of the vectorizable values sequences. The
6942/// keys/subkeys can be used for better sorting of the values themselves (keys)
6943/// and in values subgroups (subkeys).
// NOTE(review): several original lines (6959, 6969, 6977, 6992, 6997, 7002,
// 7007) are elided in this excerpt, so a few conditions/expressions below
// appear truncated -- verify against the upstream file before modifying.
6944static std::pair<size_t, size_t> generateKeySubkey(
6945 Value *V, const TargetLibraryInfo *TLI,
6946 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6947 bool AllowAlternate) {
// Seed the key with the value kind (+2 offset) and start with a neutral
// subkey; both are refined per instruction category below.
6948 hash_code Key = hash_value(V->getValueID() + 2);
6949 hash_code SubKey = hash_value(0);
6950 // Sort the loads by the distance between the pointers.
6951 if (auto *LI = dyn_cast<LoadInst>(V)) {
6952 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
// Simple loads get a caller-computed subkey so related loads can group;
// non-simple (atomic/volatile) loads hash to themselves and never group.
6953 if (LI->isSimple())
6954 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
6955 else
6956 Key = SubKey = hash_value(LI);
6957 } else if (isVectorLikeInstWithConstOps(V)) {
6958 // Sort extracts by the vector operands.
6960 Key = hash_value(Value::UndefValueVal + 1);
6961 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
// Only use the vector operand as subkey when it is meaningfully defined
// (not an all-undef vector or undef index).
6962 if (!isUndefVector(EI->getVectorOperand()).all() &&
6963 !isa<UndefValue>(EI->getIndexOperand()))
6964 SubKey = hash_value(EI->getVectorOperand());
6965 }
6966 } else if (auto *I = dyn_cast<Instruction>(V)) {
6967 // Sort other instructions just by the opcodes except for CMPInst.
6968 // For CMP also sort by the predicate kind.
// NOTE(review): the opening of this condition (original line 6969) is
// elided here; only its trailing clause is visible.
6970 isValidForAlternation(I->getOpcode())) {
// With alternation allowed, only distinguish binops from non-binops so
// alternate-opcode sequences can share a key.
6971 if (AllowAlternate)
6972 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
6973 else
6974 Key = hash_combine(hash_value(I->getOpcode()), Key);
6975 SubKey = hash_combine(
6976 hash_value(I->getOpcode()), hash_value(I->getType()),
6978 ? I->getType()
6979 : cast<CastInst>(I)->getOperand(0)->getType()));
6980 // For casts, look through the only operand to improve compile time.
6981 if (isa<CastInst>(I)) {
6982 std::pair<size_t, size_t> OpVals =
6983 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
6984 /*AllowAlternate=*/true);
6985 Key = hash_combine(OpVals.first, Key);
6986 SubKey = hash_combine(OpVals.first, SubKey);
6987 }
6988 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
6989 CmpInst::Predicate Pred = CI->getPredicate();
// Canonicalize commutative predicates so equivalent compares hash alike.
6990 if (CI->isCommutative())
6991 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
6993 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
6994 hash_value(SwapPred),
6995 hash_value(CI->getOperand(0)->getType()));
6996 } else if (auto *Call = dyn_cast<CallInst>(I)) {
6998 if (isTriviallyVectorizable(ID)) {
6999 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7000 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7001 SubKey = hash_combine(hash_value(I->getOpcode()),
7003 } else {
// Calls with no vector mapping: isolate by hashing the call itself so
// they never group with other calls.
7004 Key = hash_combine(hash_value(Call), Key);
7005 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7006 }
7008 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7009 hash_value(Op.Tag), SubKey);
7010 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
// Single-constant-index GEPs group by base pointer; anything more
// complex hashes to itself.
7011 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7012 SubKey = hash_value(Gep->getPointerOperand());
7013 else
7014 SubKey = hash_value(Gep);
7015 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7016 !isa<ConstantInt>(I->getOperand(1))) {
7017 // Do not try to vectorize instructions with potentially high cost.
7018 SubKey = hash_value(I);
7019 } else {
7020 SubKey = hash_value(I->getOpcode());
7021 }
// Mix in the parent block so instructions from different blocks never
// share a key.
7022 Key = hash_combine(hash_value(I->getParent()), Key);
7023 }
7024 return std::make_pair(Key, SubKey);
7025}
7026
7027/// Checks if the specified instruction \p I is an alternate operation for
7028/// the given \p MainOp and \p AltOp instructions.
7029static bool isAlternateInstruction(const Instruction *I,
7030 const Instruction *MainOp,
7031 const Instruction *AltOp,
7032 const TargetLibraryInfo &TLI);
7033
// Heuristic check whether an alternate-opcode (shuffle-of-two-ops) node is
// worth vectorizing: accept immediately if the target legally supports the
// alt-instruction pattern, otherwise estimate whether building the vector
// node is cheaper than emitting buildvectors for its operands.
// NOTE(review): the local declarations at original lines 7043 (operand
// vectors), 7053 (candidate pairs), 7106 (uniques map) and the condition
// opening at 7108 are elided from this excerpt -- verify upstream before
// modifying.
7034bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7035 ArrayRef<Value *> VL) const {
7036 unsigned Opcode0 = S.getOpcode();
7037 unsigned Opcode1 = S.getAltOpcode();
7038 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7039 // If this pattern is supported by the target then consider it profitable.
7040 if (TTI->isLegalAltInstr(getWidenedType(S.MainOp->getType(), VL.size()),
7041 Opcode0, Opcode1, OpcodeMask))
7042 return true;
// Gather operands column-wise: one vector per operand index across VL.
7044 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
7045 Operands.emplace_back();
7046 // Prepare the operand vector.
7047 for (Value *V : VL)
7048 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7049 }
7050 if (Operands.size() == 2) {
7051 // Try find best operands candidates.
7052 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7054 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7055 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7056 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
// Swap operand lanes according to the best-scoring pairing, if any.
7057 std::optional<int> Res = findBestRootPair(Candidates);
7058 switch (Res.value_or(0)) {
7059 case 0:
7060 break;
7061 case 1:
7062 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7063 break;
7064 case 2:
7065 std::swap(Operands[0][I], Operands[1][I]);
7066 break;
7067 default:
7068 llvm_unreachable("Unexpected index.");
7069 }
7070 }
7071 }
7072 DenseSet<unsigned> UniqueOpcodes;
7073 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7074 unsigned NonInstCnt = 0;
7075 // Estimate number of instructions, required for the vectorized node and for
7076 // the buildvector node.
7077 unsigned UndefCnt = 0;
7078 // Count the number of extra shuffles, required for vector nodes.
7079 unsigned ExtraShuffleInsts = 0;
7080 // Check that operands do not contain same values and create either perfect
7081 // diamond match or shuffled match.
7082 if (Operands.size() == 2) {
7083 // Do not count same operands twice.
7084 if (Operands.front() == Operands.back()) {
7085 Operands.erase(Operands.begin());
7086 } else if (!allConstant(Operands.front()) &&
7087 all_of(Operands.front(), [&](Value *V) {
7088 return is_contained(Operands.back(), V);
7089 })) {
// Same value set in a different order: one operand vector suffices plus
// one extra shuffle.
7090 Operands.erase(Operands.begin());
7091 ++ExtraShuffleInsts;
7092 }
7093 }
7094 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
7095 // Vectorize node, if:
7096 // 1. at least single operand is constant or splat.
7097 // 2. Operands have many loop invariants (the instructions are not loop
7098 // invariants).
7099 // 3. At least single unique operands is supposed to vectorized.
7100 return none_of(Operands,
7101 [&](ArrayRef<Value *> Op) {
7102 if (allConstant(Op) ||
7103 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7104 getSameOpcode(Op, *TLI).MainOp))
7105 return false;
7107 for (Value *V : Op) {
7109 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7110 if (isa<UndefValue>(V))
7111 ++UndefCnt;
7112 continue;
7113 }
7114 auto Res = Uniques.try_emplace(V, 0);
7115 // Found first duplicate - need to add shuffle.
7116 if (!Res.second && Res.first->second == 1)
7117 ++ExtraShuffleInsts;
7118 ++Res.first->getSecond();
7119 if (auto *I = dyn_cast<Instruction>(V))
7120 UniqueOpcodes.insert(I->getOpcode());
7121 else if (Res.second)
7122 ++NonInstCnt;
7123 }
// Reject if some unique value has uses outside the tree/this
// operand list (would force keeping the scalar alive anyway).
7124 return none_of(Uniques, [&](const auto &P) {
7125 return P.first->hasNUsesOrMore(P.second + 1) &&
7126 none_of(P.first->users(), [&](User *U) {
7127 return getTreeEntry(U) || Uniques.contains(U);
7128 });
7129 });
7130 }) ||
7131 // Do not vectorize node, if estimated number of vector instructions is
7132 // more than estimated number of buildvector instructions. Number of
7133 // vector operands is number of vector instructions + number of vector
7134 // instructions for operands (buildvectors). Number of buildvector
7135 // instructions is just number_of_operands * number_of_scalars.
7136 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
7137 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7138 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
7139}
7140
// Decides, per opcode, whether the scalar bundle VL can form a vectorizable
// tree entry (Vectorize / ScatterVectorize / StridedVectorize) or must be
// gathered (NeedToGather). May fill CurrentOrder / PointerOps as side
// outputs for loads and stores.
// NOTE(review): this excerpt of the scraped source is missing a number of
// original lines (e.g. 7147, 7164/7166, 7224/7226/7228, 7242, 7274/7277,
// 7401, 7417, 7425, 7433/7436, 7459), so some conditions, case labels and
// declarations below appear truncated -- verify upstream before editing.
7141BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7142 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
7143 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) {
7144 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
7145
// Bail out to gathering for non-fast FP binops/calls (condition opening at
// original line 7147 is elided here).
7146 if (S.MainOp->getType()->isFloatingPointTy() &&
7148 auto *I = dyn_cast<Instruction>(V);
7149 return I && (I->isBinaryOp() || isa<CallInst>(I)) && !I->isFast();
7150 }))
7151 return TreeEntry::NeedToGather;
7152
// Alternate-opcode bundles are treated as a ShuffleVector pseudo-opcode.
7153 unsigned ShuffleOrOp =
7154 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7155 auto *VL0 = cast<Instruction>(S.OpValue);
7156 switch (ShuffleOrOp) {
7157 case Instruction::PHI: {
7158 // Too many operands - gather, most probably won't be vectorized.
7159 if (VL0->getNumOperands() > MaxPHINumOperands)
7160 return TreeEntry::NeedToGather;
7161 // Check for terminator values (e.g. invoke).
7162 for (Value *V : VL)
7163 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
7165 if (Term && Term->isTerminator()) {
7167 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7168 return TreeEntry::NeedToGather;
7169 }
7170 }
7171
7172 return TreeEntry::Vectorize;
7173 }
7174 case Instruction::ExtractValue:
7175 case Instruction::ExtractElement: {
7176 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
7177 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
7178 if (!has_single_bit(VL.size()))
7179 return TreeEntry::NeedToGather;
7180 if (Reuse || !CurrentOrder.empty())
7181 return TreeEntry::Vectorize;
7182 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7183 return TreeEntry::NeedToGather;
7184 }
7185 case Instruction::InsertElement: {
7186 // Check that we have a buildvector and not a shuffle of 2 or more
7187 // different vectors.
7188 ValueSet SourceVectors;
7189 for (Value *V : VL) {
7190 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7191 assert(getElementIndex(V) != std::nullopt &&
7192 "Non-constant or undef index?");
7193 }
7194
7195 if (count_if(VL, [&SourceVectors](Value *V) {
7196 return !SourceVectors.contains(V);
7197 }) >= 2) {
7198 // Found 2nd source vector - cancel.
7199 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7200 "different source vectors.\n");
7201 return TreeEntry::NeedToGather;
7202 }
7203
7204 if (any_of(VL, [&SourceVectors](Value *V) {
7205 // The last InsertElement can have multiple uses.
7206 return SourceVectors.contains(V) && !V->hasOneUse();
7207 })) {
7208 assert(SLPReVec && "Only supported by REVEC.");
7209 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7210 "multiple uses.\n");
7211 return TreeEntry::NeedToGather;
7212 }
7213
7214 return TreeEntry::Vectorize;
7215 }
7216 case Instruction::Load: {
7217 // Check that a vectorized load would load the same memory as a scalar
7218 // load. For example, we don't want to vectorize loads that are smaller
7219 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7220 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7221 // from such a struct, we read/write packed bits disagreeing with the
7222 // unvectorized version.
// NOTE(review): the LoadsState case labels at original lines 7224, 7226
// and 7228 (presumably Vectorize / ScatterVectorize / StridedVectorize)
// are elided in this excerpt.
7223 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7225 return TreeEntry::Vectorize;
7227 return TreeEntry::ScatterVectorize;
7229 return TreeEntry::StridedVectorize;
7230 case LoadsState::Gather:
7231#ifndef NDEBUG
// Debug-only diagnosis of *why* the loads must be gathered.
7232 Type *ScalarTy = VL0->getType();
7233 if (DL->getTypeSizeInBits(ScalarTy) !=
7234 DL->getTypeAllocSizeInBits(ScalarTy))
7235 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7236 else if (any_of(VL,
7237 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
7238 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7239 else
7240 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7241#endif // NDEBUG
7243 return TreeEntry::NeedToGather;
7244 }
7245 llvm_unreachable("Unexpected state of loads");
7246 }
7247 case Instruction::ZExt:
7248 case Instruction::SExt:
7249 case Instruction::FPToUI:
7250 case Instruction::FPToSI:
7251 case Instruction::FPExt:
7252 case Instruction::PtrToInt:
7253 case Instruction::IntToPtr:
7254 case Instruction::SIToFP:
7255 case Instruction::UIToFP:
7256 case Instruction::Trunc:
7257 case Instruction::FPTrunc:
7258 case Instruction::BitCast: {
// All casts in the bundle must share one valid source element type.
7259 Type *SrcTy = VL0->getOperand(0)->getType();
7260 for (Value *V : VL) {
7261 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7262 if (Ty != SrcTy || !isValidElementType(Ty)) {
7263 LLVM_DEBUG(
7264 dbgs() << "SLP: Gathering casts with different src types.\n");
7265 return TreeEntry::NeedToGather;
7266 }
7267 }
7268 return TreeEntry::Vectorize;
7269 }
7270 case Instruction::ICmp:
7271 case Instruction::FCmp: {
7272 // Check that all of the compares have the same predicate.
7273 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7275 Type *ComparedTy = VL0->getOperand(0)->getType();
7276 for (Value *V : VL) {
// Accept either the main predicate or its swapped form (SwapP0,
// declared on the elided original line 7274).
7278 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7279 Cmp->getOperand(0)->getType() != ComparedTy) {
7280 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7281 return TreeEntry::NeedToGather;
7282 }
7283 }
7284 return TreeEntry::Vectorize;
7285 }
7286 case Instruction::Select:
7287 case Instruction::FNeg:
7288 case Instruction::Add:
7289 case Instruction::FAdd:
7290 case Instruction::Sub:
7291 case Instruction::FSub:
7292 case Instruction::Mul:
7293 case Instruction::FMul:
7294 case Instruction::UDiv:
7295 case Instruction::SDiv:
7296 case Instruction::FDiv:
7297 case Instruction::URem:
7298 case Instruction::SRem:
7299 case Instruction::FRem:
7300 case Instruction::Shl:
7301 case Instruction::LShr:
7302 case Instruction::AShr:
7303 case Instruction::And:
7304 case Instruction::Or:
7305 case Instruction::Xor:
7306 case Instruction::Freeze:
// Plain same-opcode arithmetic/logic bundles are always vectorizable here.
7307 return TreeEntry::Vectorize;
7308 case Instruction::GetElementPtr: {
7309 // We don't combine GEPs with complicated (nested) indexing.
7310 for (Value *V : VL) {
7311 auto *I = dyn_cast<GetElementPtrInst>(V);
7312 if (!I)
7313 continue;
7314 if (I->getNumOperands() != 2) {
7315 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7316 return TreeEntry::NeedToGather;
7317 }
7318 }
7319
7320 // We can't combine several GEPs into one vector if they operate on
7321 // different types.
7322 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7323 for (Value *V : VL) {
7324 auto *GEP = dyn_cast<GEPOperator>(V);
7325 if (!GEP)
7326 continue;
7327 Type *CurTy = GEP->getSourceElementType();
7328 if (Ty0 != CurTy) {
7329 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7330 return TreeEntry::NeedToGather;
7331 }
7332 }
7333
7334 // We don't combine GEPs with non-constant indexes.
7335 Type *Ty1 = VL0->getOperand(1)->getType();
7336 for (Value *V : VL) {
7337 auto *I = dyn_cast<GetElementPtrInst>(V);
7338 if (!I)
7339 continue;
7340 auto *Op = I->getOperand(1);
// Under a scatter-vectorize user, non-constant indexes are tolerated
// as long as the index type fits the address-space index width.
7341 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7342 (Op->getType() != Ty1 &&
7343 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7344 Op->getType()->getScalarSizeInBits() >
7345 DL->getIndexSizeInBits(
7346 V->getType()->getPointerAddressSpace())))) {
7347 LLVM_DEBUG(
7348 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7349 return TreeEntry::NeedToGather;
7350 }
7351 }
7352
7353 return TreeEntry::Vectorize;
7354 }
7355 case Instruction::Store: {
7356 // Check if the stores are consecutive or if we need to swizzle them.
7357 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7358 // Avoid types that are padded when being allocated as scalars, while
7359 // being packed together in a vector (such as i1).
7360 if (DL->getTypeSizeInBits(ScalarTy) !=
7361 DL->getTypeAllocSizeInBits(ScalarTy)) {
7362 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7363 return TreeEntry::NeedToGather;
7364 }
7365 // Make sure all stores in the bundle are simple - we can't vectorize
7366 // atomic or volatile stores.
7367 for (Value *V : VL) {
7368 auto *SI = cast<StoreInst>(V);
7369 if (!SI->isSimple()) {
7370 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7371 return TreeEntry::NeedToGather;
7372 }
// Side output: collect pointer operands for the order check below.
7373 PointerOps.push_back(SI->getPointerOperand());
7374 }
7375
7376 // Check the order of pointer operands.
7377 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7378 Value *Ptr0;
7379 Value *PtrN;
7380 if (CurrentOrder.empty()) {
7381 Ptr0 = PointerOps.front();
7382 PtrN = PointerOps.back();
7383 } else {
7384 Ptr0 = PointerOps[CurrentOrder.front()];
7385 PtrN = PointerOps[CurrentOrder.back()];
7386 }
7387 std::optional<int> Dist =
7388 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7389 // Check that the sorted pointer operands are consecutive.
7390 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7391 return TreeEntry::Vectorize;
7392 }
7393
7394 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7395 return TreeEntry::NeedToGather;
7396 }
7397 case Instruction::Call: {
7398 // Check if the calls are all to the same vectorizable intrinsic or
7399 // library function.
7400 CallInst *CI = cast<CallInst>(VL0);
7402
7403 VFShape Shape = VFShape::get(
7404 CI->getFunctionType(),
7405 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7406 false /*HasGlobalPred*/);
7407 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7408
7409 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7410 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7411 return TreeEntry::NeedToGather;
7412 }
7413 Function *F = CI->getCalledFunction();
7414 unsigned NumArgs = CI->arg_size();
// Remember scalar arguments (those the intrinsic requires to be uniform,
// filtered by the elided condition at original line 7417).
7415 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7416 for (unsigned J = 0; J != NumArgs; ++J)
7418 ScalarArgs[J] = CI->getArgOperand(J);
7419 for (Value *V : VL) {
7420 CallInst *CI2 = dyn_cast<CallInst>(V);
7421 if (!CI2 || CI2->getCalledFunction() != F ||
7422 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7423 (VecFunc &&
7424 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7426 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7427 << "\n");
7428 return TreeEntry::NeedToGather;
7429 }
7430 // Some intrinsics have scalar arguments and should be same in order for
7431 // them to be vectorized.
7432 for (unsigned J = 0; J != NumArgs; ++J) {
7434 Value *A1J = CI2->getArgOperand(J);
7435 if (ScalarArgs[J] != A1J) {
7437 << "SLP: mismatched arguments in call:" << *CI
7438 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7439 return TreeEntry::NeedToGather;
7440 }
7441 }
7442 }
7443 // Verify that the bundle operands are identical between the two calls.
7444 if (CI->hasOperandBundles() &&
7445 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7446 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7447 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7448 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7449 << "!=" << *V << '\n');
7450 return TreeEntry::NeedToGather;
7451 }
7452 }
7453
7454 return TreeEntry::Vectorize;
7455 }
7456 case Instruction::ShuffleVector: {
7457 if (!S.isAltShuffle()) {
7458 // REVEC can support non alternate shuffle.
7460 return TreeEntry::Vectorize;
7461 // If this is not an alternate sequence of opcode like add-sub
7462 // then do not vectorize this instruction.
7463 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7464 return TreeEntry::NeedToGather;
7465 }
7466 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7467 LLVM_DEBUG(
7468 dbgs()
7469 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7470 "the whole alt sequence is not profitable.\n");
7471 return TreeEntry::NeedToGather;
7472 }
7473
7474 return TreeEntry::Vectorize;
7475 }
7476 default:
7477 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7478 return TreeEntry::NeedToGather;
7479 }
7480}
7481
7482namespace {
7483/// Allows to correctly handle operands of the phi nodes based on the \p Main
7484/// PHINode order of incoming basic blocks/values.
// NOTE(review): the member declarations at original lines 7488-7489
// (presumably the Phis list and the per-incoming-block Operands matrix,
// given their use in the constructor below) and locals at 7517/7531/7547
// are elided from this excerpt -- verify upstream before editing.
7485class PHIHandler {
// Dominator tree used to skip incoming blocks unreachable from entry.
7486 DominatorTree &DT;
// The reference PHI whose incoming-block order defines operand layout.
7487 PHINode *Main = nullptr;
7490
7491public:
7492 PHIHandler() = delete;
// Operands is sized [incoming blocks of Main] x [number of phis], filled
// lazily by buildOperands().
7493 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7494 : DT(DT), Main(Main), Phis(Phis),
7495 Operands(Main->getNumIncomingValues(),
7496 SmallVector<Value *>(Phis.size(), nullptr)) {}
// Populates Operands so that row I holds, for every phi, the incoming
// value corresponding to Main's I-th incoming block.
7497 void buildOperands() {
// For few incoming blocks, a direct per-block lookup is cheapest.
7498 constexpr unsigned FastLimit = 4;
7499 if (Main->getNumIncomingValues() <= FastLimit) {
7500 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7501 BasicBlock *InBB = Main->getIncomingBlock(I);
// Unreachable predecessor: any value works, use poison.
7502 if (!DT.isReachableFromEntry(InBB)) {
7503 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7504 continue;
7505 }
7506 // Prepare the operand vector.
7507 for (auto [Idx, V] : enumerate(Phis)) {
7508 auto *P = cast<PHINode>(V);
7509 if (P->getIncomingBlock(I) == InBB)
7510 Operands[I][Idx] = P->getIncomingValue(I);
7511 else
7512 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
7513 }
7514 }
7515 return;
7516 }
// Slow path: index Main's incoming blocks once (Blocks map, declared on
// the elided original line 7517), then resolve each phi against it.
7518 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7519 BasicBlock *InBB = Main->getIncomingBlock(I);
7520 if (!DT.isReachableFromEntry(InBB)) {
7521 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7522 continue;
7523 }
// A block may appear multiple times; remember every position.
7524 Blocks.try_emplace(InBB).first->second.push_back(I);
7525 }
7526 for (auto [Idx, V] : enumerate(Phis)) {
7527 auto *P = cast<PHINode>(V);
7528 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
7529 BasicBlock *InBB = P->getIncomingBlock(I);
7530 if (InBB == Main->getIncomingBlock(I)) {
7532 continue;
7533 Operands[I][Idx] = P->getIncomingValue(I);
7534 continue;
7535 }
7536 auto It = Blocks.find(InBB);
7537 if (It == Blocks.end())
7538 continue;
// Record under the first position of this block; duplicated
// positions are copied in the fix-up loop below.
7539 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
7540 }
7541 }
// Replicate the operand row of a block's first occurrence into the rows
// of its duplicate occurrences.
7542 for (const auto &P : Blocks) {
7543 if (P.getSecond().size() <= 1)
7544 continue;
7545 unsigned BasicI = P.getSecond().front();
7546 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
7548 [&](const auto &Data) {
7549 return !Data.value() ||
7550 Data.value() == Operands[BasicI][Data.index()];
7551 }) &&
7552 "Expected empty operands list.");
7553 Operands[I] = Operands[BasicI];
7554 }
7555 }
7556 }
// Returns the gathered operand row for Main's I-th incoming block.
7557 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
7558};
7559} // namespace
7560
7561void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
7562 const EdgeInfo &UserTreeIdx) {
7563 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
7564
7565 SmallVector<int> ReuseShuffleIndices;
7566 SmallVector<Value *> UniqueValues;
7567 SmallVector<Value *> NonUniqueValueVL;
7568 auto TryToFindDuplicates = [&](const InstructionsState &S,
7569 bool DoNotFail = false) {
7570 // Check that every instruction appears once in this bundle.
7571 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
7572 for (Value *V : VL) {
7573 if (isConstant(V)) {
7574 ReuseShuffleIndices.emplace_back(
7575 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
7576 UniqueValues.emplace_back(V);
7577 continue;
7578 }
7579 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
7580 ReuseShuffleIndices.emplace_back(Res.first->second);
7581 if (Res.second)
7582 UniqueValues.emplace_back(V);
7583 }
7584 size_t NumUniqueScalarValues = UniqueValues.size();
7585 bool IsFullVectors = hasFullVectorsOrPowerOf2(
7586 *TTI, UniqueValues.front()->getType(), NumUniqueScalarValues);
7587 if (NumUniqueScalarValues == VL.size() &&
7588 (VectorizeNonPowerOf2 || IsFullVectors)) {
7589 ReuseShuffleIndices.clear();
7590 } else {
7591 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
7592 if ((UserTreeIdx.UserTE &&
7593 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
7594 !has_single_bit(VL.size())) {
7595 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
7596 "for nodes with padding.\n");
7597 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7598 return false;
7599 }
7600 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
7601 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
7602 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
7603 return isa<UndefValue>(V) || !isConstant(V);
7604 }))) {
7605 if (DoNotFail && UniquePositions.size() > 1 &&
7606 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
7607 all_of(UniqueValues, [=](Value *V) {
7608 return isa<ExtractElementInst>(V) ||
7609 areAllUsersVectorized(cast<Instruction>(V),
7610 UserIgnoreList);
7611 })) {
7612 // Find the number of elements, which forms full vectors.
7613 unsigned PWSz = getFullVectorNumberOfElements(
7614 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
7615 if (PWSz == VL.size()) {
7616 ReuseShuffleIndices.clear();
7617 } else {
7618 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
7619 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
7620 UniqueValues.back());
7621 VL = NonUniqueValueVL;
7622 }
7623 return true;
7624 }
7625 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
7626 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7627 return false;
7628 }
7629 VL = UniqueValues;
7630 }
7631 return true;
7632 };
7633
7634 InstructionsState S = getSameOpcode(VL, *TLI);
7635
7636 // Don't go into catchswitch blocks, which can happen with PHIs.
7637 // Such blocks can only have PHIs and the catchswitch. There is no
7638 // place to insert a shuffle if we need to, so just avoid that issue.
7639 if (S.MainOp &&
7640 isa<CatchSwitchInst>(S.MainOp->getParent()->getTerminator())) {
7641 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7642 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7643 return;
7644 }
7645
7646 // Check if this is a duplicate of another entry.
7647 if (S.getOpcode()) {
7648 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
7649 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
7650 if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) {
7651 auto It = MultiNodeScalars.find(S.OpValue);
7652 if (It != MultiNodeScalars.end()) {
7653 auto *TEIt = find_if(It->getSecond(),
7654 [&](TreeEntry *ME) { return ME->isSame(VL); });
7655 if (TEIt != It->getSecond().end())
7656 E = *TEIt;
7657 else
7658 E = nullptr;
7659 } else {
7660 E = nullptr;
7661 }
7662 }
7663 if (!E) {
7664 if (!doesNotNeedToBeScheduled(S.OpValue)) {
7665 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
7666 if (TryToFindDuplicates(S))
7667 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7668 ReuseShuffleIndices);
7669 return;
7670 }
7672 Nodes.insert(getTreeEntry(S.OpValue));
7673 for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue))
7674 Nodes.insert(E);
7675 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
7676 if (any_of(Nodes, [&](const TreeEntry *E) {
7677 return all_of(E->Scalars,
7678 [&](Value *V) { return Values.contains(V); });
7679 })) {
7680 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
7681 if (TryToFindDuplicates(S))
7682 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7683 ReuseShuffleIndices);
7684 return;
7685 }
7686 } else {
7687 // Record the reuse of the tree node. FIXME, currently this is only
7688 // used to properly draw the graph rather than for the actual
7689 // vectorization.
7690 E->UserTreeIndices.push_back(UserTreeIdx);
7691 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
7692 << ".\n");
7693 return;
7694 }
7695 }
7696 }
7697
7698 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
7699 // a load), in which case peek through to include it in the tree, without
7700 // ballooning over-budget.
7701 if (Depth >= RecursionMaxDepth &&
7702 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
7703 VL.size() >= 4 &&
7704 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
7705 return match(I,
7707 cast<Instruction>(I)->getOpcode() ==
7708 cast<Instruction>(S.MainOp)->getOpcode();
7709 })))) {
7710 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
7711 if (TryToFindDuplicates(S))
7712 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7713 ReuseShuffleIndices);
7714 return;
7715 }
7716
7717 // Don't handle scalable vectors
7718 if (S.getOpcode() == Instruction::ExtractElement &&
7720 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
7721 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
7722 if (TryToFindDuplicates(S))
7723 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7724 ReuseShuffleIndices);
7725 return;
7726 }
7727
7728 // Don't handle vectors.
7729 if (!SLPReVec && getValueType(S.OpValue)->isVectorTy()) {
7730 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
7731 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7732 return;
7733 }
7734
7735 // If all of the operands are identical or constant we have a simple solution.
7736 // If we deal with insert/extract instructions, they all must have constant
7737 // indices, otherwise we should gather them, not try to vectorize.
7738 // If alternate op node with 2 elements with gathered operands - do not
7739 // vectorize.
7740 auto &&NotProfitableForVectorization = [&S, this,
7742 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
7743 return false;
7744 if (VectorizableTree.size() < MinTreeSize)
7745 return false;
7746 if (Depth >= RecursionMaxDepth - 1)
7747 return true;
7748 // Check if all operands are extracts, part of vector node or can build a
7749 // regular vectorize node.
7750 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
7751 for (Value *V : VL) {
7752 auto *I = cast<Instruction>(V);
7753 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
7754 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
7755 }));
7756 }
7757 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
7758 if ((IsCommutative &&
7759 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
7760 (!IsCommutative &&
7761 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
7762 return true;
7763 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
7765 auto *I1 = cast<Instruction>(VL.front());
7766 auto *I2 = cast<Instruction>(VL.back());
7767 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
7768 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
7769 I2->getOperand(Op));
7770 if (static_cast<unsigned>(count_if(
7771 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
7773 })) >= S.MainOp->getNumOperands() / 2)
7774 return false;
7775 if (S.MainOp->getNumOperands() > 2)
7776 return true;
7777 if (IsCommutative) {
7778 // Check permuted operands.
7779 Candidates.clear();
7780 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
7781 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
7782 I2->getOperand((Op + 1) % E));
7783 if (any_of(
7784 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
7786 }))
7787 return false;
7788 }
7789 return true;
7790 };
7791 SmallVector<unsigned> SortedIndices;
7792 BasicBlock *BB = nullptr;
7793 bool IsScatterVectorizeUserTE =
7794 UserTreeIdx.UserTE &&
7795 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
7796 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
7797 bool AreScatterAllGEPSameBlock =
7798 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
7799 VL.size() > 2 &&
7800 all_of(VL,
7801 [&BB](Value *V) {
7802 auto *I = dyn_cast<GetElementPtrInst>(V);
7803 if (!I)
7804 return doesNotNeedToBeScheduled(V);
7805 if (!BB)
7806 BB = I->getParent();
7807 return BB == I->getParent() && I->getNumOperands() == 2;
7808 }) &&
7809 BB &&
7810 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
7811 SortedIndices));
7812 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
7813 if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) ||
7815 S.OpValue) &&
7817 NotProfitableForVectorization(VL)) {
7818 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
7819 if (TryToFindDuplicates(S))
7820 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7821 ReuseShuffleIndices);
7822 return;
7823 }
7824
7825 // Don't vectorize ephemeral values.
7826 if (S.getOpcode() && !EphValues.empty()) {
7827 for (Value *V : VL) {
7828 if (EphValues.count(V)) {
7829 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
7830 << ") is ephemeral.\n");
7831 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7832 return;
7833 }
7834 }
7835 }
7836
7837 // We now know that this is a vector of instructions of the same type from
7838 // the same block.
7839
7840 // Check that none of the instructions in the bundle are already in the tree.
7841 for (Value *V : VL) {
7842 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
7844 continue;
7845 if (getTreeEntry(V)) {
7846 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
7847 << ") is already in tree.\n");
7848 if (TryToFindDuplicates(S))
7849 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7850 ReuseShuffleIndices);
7851 return;
7852 }
7853 }
7854
7855 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
7856 if (UserIgnoreList && !UserIgnoreList->empty()) {
7857 for (Value *V : VL) {
7858 if (UserIgnoreList->contains(V)) {
7859 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
7860 if (TryToFindDuplicates(S))
7861 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7862 ReuseShuffleIndices);
7863 return;
7864 }
7865 }
7866 }
7867
7868 // Special processing for sorted pointers for ScatterVectorize node with
7869 // constant indeces only.
7870 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
7871 assert(S.OpValue->getType()->isPointerTy() &&
7873 "Expected pointers only.");
7874 // Reset S to make it GetElementPtr kind of node.
7875 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
7876 assert(It != VL.end() && "Expected at least one GEP.");
7877 S = getSameOpcode(*It, *TLI);
7878 }
7879
7880 // Check that all of the users of the scalars that we want to vectorize are
7881 // schedulable.
7882 auto *VL0 = cast<Instruction>(S.OpValue);
7883 BB = VL0->getParent();
7884
7885 if (S.MainOp && !DT->isReachableFromEntry(BB)) {
7886 // Don't go into unreachable blocks. They may contain instructions with
7887 // dependency cycles which confuse the final scheduling.
7888 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
7889 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
7890 return;
7891 }
7892
7893 // Check that every instruction appears once in this bundle.
7894 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7895 return;
7896
7897 // Perform specific checks for each particular instruction kind.
7898 OrdersType CurrentOrder;
7899 SmallVector<Value *> PointerOps;
7900 TreeEntry::EntryState State = getScalarsVectorizationState(
7901 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7902 if (State == TreeEntry::NeedToGather) {
7903 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7904 ReuseShuffleIndices);
7905 return;
7906 }
7907
7908 auto &BSRef = BlocksSchedules[BB];
7909 if (!BSRef)
7910 BSRef = std::make_unique<BlockScheduling>(BB);
7911
7912 BlockScheduling &BS = *BSRef;
7913
7914 std::optional<ScheduleData *> Bundle =
7915 BS.tryScheduleBundle(UniqueValues, this, S);
7916#ifdef EXPENSIVE_CHECKS
7917 // Make sure we didn't break any internal invariants
7918 BS.verify();
7919#endif
7920 if (!Bundle) {
7921 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7922 assert((!BS.getScheduleData(VL0) ||
7923 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7924 "tryScheduleBundle should cancelScheduling on failure");
7925 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
7926 ReuseShuffleIndices);
7927 NonScheduledFirst.insert(VL.front());
7928 if (S.getOpcode() == Instruction::Load &&
7929 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
7931 return;
7932 }
7933 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7934
7935 unsigned ShuffleOrOp = S.isAltShuffle() ?
7936 (unsigned) Instruction::ShuffleVector : S.getOpcode();
7937 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
7938 // Postpone PHI nodes creation
7939 SmallVector<unsigned> PHIOps;
7940 for (unsigned I : seq<unsigned>(Operands.size())) {
7942 if (Op.empty())
7943 continue;
7944 InstructionsState S = getSameOpcode(Op, *TLI);
7945 if (S.getOpcode() != Instruction::PHI || S.isAltShuffle())
7946 buildTree_rec(Op, Depth + 1, {TE, I});
7947 else
7948 PHIOps.push_back(I);
7949 }
7950 for (unsigned I : PHIOps)
7951 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7952 };
7953 switch (ShuffleOrOp) {
7954 case Instruction::PHI: {
7955 auto *PH = cast<PHINode>(VL0);
7956
7957 TreeEntry *TE =
7958 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7959 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7960
7961 // Keeps the reordered operands to avoid code duplication.
7962 PHIHandler Handler(*DT, PH, VL);
7963 Handler.buildOperands();
7964 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
7965 TE->setOperand(I, Handler.getOperands(I));
7966 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
7967 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
7968 Operands[I] = Handler.getOperands(I);
7969 CreateOperandNodes(TE, Operands);
7970 return;
7971 }
7972 case Instruction::ExtractValue:
7973 case Instruction::ExtractElement: {
7974 if (CurrentOrder.empty()) {
7975 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7976 } else {
7977 LLVM_DEBUG({
7978 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7979 "with order";
7980 for (unsigned Idx : CurrentOrder)
7981 dbgs() << " " << Idx;
7982 dbgs() << "\n";
7983 });
7984 fixupOrderingIndices(CurrentOrder);
7985 }
7986 // Insert new order with initial value 0, if it does not exist,
7987 // otherwise return the iterator to the existing one.
7988 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7989 ReuseShuffleIndices, CurrentOrder);
7990 // This is a special case, as it does not gather, but at the same time
7991 // we are not extending buildTree_rec() towards the operands.
7992 ValueList Op0;
7993 Op0.assign(VL.size(), VL0->getOperand(0));
7994 VectorizableTree.back()->setOperand(0, Op0);
7995 return;
7996 }
7997 case Instruction::InsertElement: {
7998 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7999
8000 auto OrdCompare = [](const std::pair<int, int> &P1,
8001 const std::pair<int, int> &P2) {
8002 return P1.first > P2.first;
8003 };
8005 decltype(OrdCompare)>
8006 Indices(OrdCompare);
8007 for (int I = 0, E = VL.size(); I < E; ++I) {
8008 unsigned Idx = *getElementIndex(VL[I]);
8009 Indices.emplace(Idx, I);
8010 }
8011 OrdersType CurrentOrder(VL.size(), VL.size());
8012 bool IsIdentity = true;
8013 for (int I = 0, E = VL.size(); I < E; ++I) {
8014 CurrentOrder[Indices.top().second] = I;
8015 IsIdentity &= Indices.top().second == I;
8016 Indices.pop();
8017 }
8018 if (IsIdentity)
8019 CurrentOrder.clear();
8020 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8021 {}, CurrentOrder);
8022 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
8023
8024 TE->setOperandsInOrder();
8025 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8026 return;
8027 }
8028 case Instruction::Load: {
8029 // Check that a vectorized load would load the same memory as a scalar
8030 // load. For example, we don't want to vectorize loads that are smaller
8031 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
8032 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8033 // from such a struct, we read/write packed bits disagreeing with the
8034 // unvectorized version.
8035 TreeEntry *TE = nullptr;
8036 fixupOrderingIndices(CurrentOrder);
8037 switch (State) {
8038 case TreeEntry::Vectorize:
8039 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8040 ReuseShuffleIndices, CurrentOrder);
8041 if (CurrentOrder.empty())
8042 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
8043 else
8044 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
8045 TE->setOperandsInOrder();
8046 break;
8047 case TreeEntry::StridedVectorize:
8048 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8049 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8050 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8051 TE->setOperandsInOrder();
8052 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
8053 break;
8054 case TreeEntry::ScatterVectorize:
8055 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8056 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8057 UserTreeIdx, ReuseShuffleIndices);
8058 TE->setOperandsInOrder();
8059 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8060 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
8061 break;
8062 case TreeEntry::CombinedVectorize:
8063 case TreeEntry::NeedToGather:
8064 llvm_unreachable("Unexpected loads state.");
8065 }
8066 return;
8067 }
8068 case Instruction::ZExt:
8069 case Instruction::SExt:
8070 case Instruction::FPToUI:
8071 case Instruction::FPToSI:
8072 case Instruction::FPExt:
8073 case Instruction::PtrToInt:
8074 case Instruction::IntToPtr:
8075 case Instruction::SIToFP:
8076 case Instruction::UIToFP:
8077 case Instruction::Trunc:
8078 case Instruction::FPTrunc:
8079 case Instruction::BitCast: {
8080 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8081 std::make_pair(std::numeric_limits<unsigned>::min(),
8082 std::numeric_limits<unsigned>::max()));
8083 if (ShuffleOrOp == Instruction::ZExt ||
8084 ShuffleOrOp == Instruction::SExt) {
8085 CastMaxMinBWSizes = std::make_pair(
8086 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8087 PrevMaxBW),
8088 std::min<unsigned>(
8089 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8090 PrevMinBW));
8091 } else if (ShuffleOrOp == Instruction::Trunc) {
8092 CastMaxMinBWSizes = std::make_pair(
8093 std::max<unsigned>(
8094 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8095 PrevMaxBW),
8096 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8097 PrevMinBW));
8098 }
8099 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8100 ReuseShuffleIndices);
8101 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
8102
8103 TE->setOperandsInOrder();
8104 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
8105 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8106 if (ShuffleOrOp == Instruction::Trunc) {
8107 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8108 } else if (ShuffleOrOp == Instruction::SIToFP ||
8109 ShuffleOrOp == Instruction::UIToFP) {
8110 unsigned NumSignBits =
8111 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8112 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8113 APInt Mask = DB->getDemandedBits(OpI);
8114 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8115 }
8116 if (NumSignBits * 2 >=
8117 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8118 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8119 }
8120 return;
8121 }
8122 case Instruction::ICmp:
8123 case Instruction::FCmp: {
8124 // Check that all of the compares have the same predicate.
8125 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8126 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8127 ReuseShuffleIndices);
8128 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
8129
8131 if (cast<CmpInst>(VL0)->isCommutative()) {
8132 // Commutative predicate - collect + sort operands of the instructions
8133 // so that each side is more likely to have the same opcode.
8135 "Commutative Predicate mismatch");
8136 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
8137 } else {
8138 // Collect operands - commute if it uses the swapped predicate.
8139 for (Value *V : VL) {
8140 auto *Cmp = cast<CmpInst>(V);
8141 Value *LHS = Cmp->getOperand(0);
8142 Value *RHS = Cmp->getOperand(1);
8143 if (Cmp->getPredicate() != P0)
8144 std::swap(LHS, RHS);
8145 Left.push_back(LHS);
8146 Right.push_back(RHS);
8147 }
8148 }
8149 TE->setOperand(0, Left);
8150 TE->setOperand(1, Right);
8151 buildTree_rec(Left, Depth + 1, {TE, 0});
8152 buildTree_rec(Right, Depth + 1, {TE, 1});
8153 if (ShuffleOrOp == Instruction::ICmp) {
8154 unsigned NumSignBits0 =
8155 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8156 if (NumSignBits0 * 2 >=
8157 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8158 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8159 unsigned NumSignBits1 =
8160 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8161 if (NumSignBits1 * 2 >=
8162 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8163 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8164 }
8165 return;
8166 }
8167 case Instruction::Select:
8168 case Instruction::FNeg:
8169 case Instruction::Add:
8170 case Instruction::FAdd:
8171 case Instruction::Sub:
8172 case Instruction::FSub:
8173 case Instruction::Mul:
8174 case Instruction::FMul:
8175 case Instruction::UDiv:
8176 case Instruction::SDiv:
8177 case Instruction::FDiv:
8178 case Instruction::URem:
8179 case Instruction::SRem:
8180 case Instruction::FRem:
8181 case Instruction::Shl:
8182 case Instruction::LShr:
8183 case Instruction::AShr:
8184 case Instruction::And:
8185 case Instruction::Or:
8186 case Instruction::Xor:
8187 case Instruction::Freeze: {
8188 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8189 ReuseShuffleIndices);
8190 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
8191
8192 // Sort operands of the instructions so that each side is more likely to
8193 // have the same opcode.
8194 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
8196 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
8197 TE->setOperand(0, Left);
8198 TE->setOperand(1, Right);
8199 buildTree_rec(Left, Depth + 1, {TE, 0});
8200 buildTree_rec(Right, Depth + 1, {TE, 1});
8201 return;
8202 }
8203
8204 TE->setOperandsInOrder();
8205 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
8206 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8207 return;
8208 }
8209 case Instruction::GetElementPtr: {
8210 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8211 ReuseShuffleIndices);
8212 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
8214 // Prepare the operand vector for pointer operands.
8215 for (Value *V : VL) {
8217 if (!GEP) {
8218 Operands.front().push_back(V);
8219 continue;
8220 }
8221 Operands.front().push_back(GEP->getPointerOperand());
8222 }
8223 TE->setOperand(0, Operands.front());
8224 // Need to cast all indices to the same type before vectorization to
8225 // avoid crash.
8226 // Required to be able to find correct matches between different gather
8227 // nodes and reuse the vectorized values rather than trying to gather them
8228 // again.
8229 int IndexIdx = 1;
8230 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8231 Type *Ty = all_of(VL,
8232 [VL0Ty, IndexIdx](Value *V) {
8234 if (!GEP)
8235 return true;
8236 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8237 })
8238 ? VL0Ty
8239 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8240 ->getPointerOperandType()
8241 ->getScalarType());
8242 // Prepare the operand vector.
8243 for (Value *V : VL) {
8244 auto *I = dyn_cast<GetElementPtrInst>(V);
8245 if (!I) {
8246 Operands.back().push_back(
8247 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8248 continue;
8249 }
8250 auto *Op = I->getOperand(IndexIdx);
8251 auto *CI = dyn_cast<ConstantInt>(Op);
8252 if (!CI)
8253 Operands.back().push_back(Op);
8254 else
8255 Operands.back().push_back(ConstantFoldIntegerCast(
8256 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8257 }
8258 TE->setOperand(IndexIdx, Operands.back());
8259
8260 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8261 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8262 return;
8263 }
8264 case Instruction::Store: {
8265 bool Consecutive = CurrentOrder.empty();
8266 if (!Consecutive)
8267 fixupOrderingIndices(CurrentOrder);
8268 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8269 ReuseShuffleIndices, CurrentOrder);
8270 TE->setOperandsInOrder();
8271 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8272 if (Consecutive)
8273 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
8274 else
8275 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
8276 return;
8277 }
8278 case Instruction::Call: {
8279 // Check if the calls are all to the same vectorizable intrinsic or
8280 // library function.
8281 CallInst *CI = cast<CallInst>(VL0);
8283
8284 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8285 ReuseShuffleIndices);
8286 // Sort operands of the instructions so that each side is more likely to
8287 // have the same opcode.
8288 if (isCommutative(VL0)) {
8290 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
8291 TE->setOperand(0, Left);
8292 TE->setOperand(1, Right);
8294 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
8295 Operands.emplace_back();
8297 continue;
8298 for (Value *V : VL) {
8299 auto *CI2 = cast<CallInst>(V);
8300 Operands.back().push_back(CI2->getArgOperand(I));
8301 }
8302 TE->setOperand(I, Operands.back());
8303 }
8304 buildTree_rec(Left, Depth + 1, {TE, 0});
8305 buildTree_rec(Right, Depth + 1, {TE, 1});
8306 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
8307 if (Operands[I - 2].empty())
8308 continue;
8309 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
8310 }
8311 return;
8312 }
8313 TE->setOperandsInOrder();
8314 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
8315 // For scalar operands no need to create an entry since no need to
8316 // vectorize it.
8318 continue;
8320 // Prepare the operand vector.
8321 for (Value *V : VL) {
8322 auto *CI2 = cast<CallInst>(V);
8323 Operands.push_back(CI2->getArgOperand(I));
8324 }
8325 buildTree_rec(Operands, Depth + 1, {TE, I});
8326 }
8327 return;
8328 }
8329 case Instruction::ShuffleVector: {
8330 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8331 ReuseShuffleIndices);
8332 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
8333
8334 // Reorder operands if reordering would enable vectorization.
8335 auto *CI = dyn_cast<CmpInst>(VL0);
8336 if (isa<BinaryOperator>(VL0) || CI) {
8338 if (!CI || all_of(VL, [](Value *V) {
8339 return cast<CmpInst>(V)->isCommutative();
8340 })) {
8341 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
8342 } else {
8343 auto *MainCI = cast<CmpInst>(S.MainOp);
8344 auto *AltCI = cast<CmpInst>(S.AltOp);
8345 CmpInst::Predicate MainP = MainCI->getPredicate();
8346 CmpInst::Predicate AltP = AltCI->getPredicate();
8347 assert(MainP != AltP &&
8348 "Expected different main/alternate predicates.");
8349 // Collect operands - commute if it uses the swapped predicate or
8350 // alternate operation.
8351 for (Value *V : VL) {
8352 auto *Cmp = cast<CmpInst>(V);
8353 Value *LHS = Cmp->getOperand(0);
8354 Value *RHS = Cmp->getOperand(1);
8355
8356 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8357 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8358 std::swap(LHS, RHS);
8359 } else {
8360 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8361 std::swap(LHS, RHS);
8362 }
8363 Left.push_back(LHS);
8364 Right.push_back(RHS);
8365 }
8366 }
8367 TE->setOperand(0, Left);
8368 TE->setOperand(1, Right);
8369 buildTree_rec(Left, Depth + 1, {TE, 0});
8370 buildTree_rec(Right, Depth + 1, {TE, 1});
8371 return;
8372 }
8373
8374 TE->setOperandsInOrder();
8375 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands()))
8376 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8377 return;
8378 }
8379 default:
8380 break;
8381 }
8382 llvm_unreachable("Unexpected vectorization of the instructions.");
8383}
8384
8386 unsigned N = 1;
8387 Type *EltTy = T;
8388
8390 if (EltTy->isEmptyTy())
8391 return 0;
8392 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8393 // Check that struct is homogeneous.
8394 for (const auto *Ty : ST->elements())
8395 if (Ty != *ST->element_begin())
8396 return 0;
8397 N *= ST->getNumElements();
8398 EltTy = *ST->element_begin();
8399 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8400 N *= AT->getNumElements();
8401 EltTy = AT->getElementType();
8402 } else {
8403 auto *VT = cast<FixedVectorType>(EltTy);
8404 N *= VT->getNumElements();
8405 EltTy = VT->getElementType();
8406 }
8407 }
8408
8409 if (!isValidElementType(EltTy))
8410 return 0;
8411 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8412 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8413 VTSize != DL->getTypeStoreSizeInBits(T))
8414 return 0;
8415 return N;
8416}
8417
8418bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
8419 SmallVectorImpl<unsigned> &CurrentOrder,
8420 bool ResizeAllowed) const {
8422 assert(It != VL.end() && "Expected at least one extract instruction.");
8423 auto *E0 = cast<Instruction>(*It);
8424 assert(
8426 "Invalid opcode");
8427 // Check if all of the extracts come from the same vector and from the
8428 // correct offset.
8429 Value *Vec = E0->getOperand(0);
8430
8431 CurrentOrder.clear();
8432
8433 // We have to extract from a vector/aggregate with the same number of elements.
8434 unsigned NElts;
8435 if (E0->getOpcode() == Instruction::ExtractValue) {
8436 NElts = canMapToVector(Vec->getType());
8437 if (!NElts)
8438 return false;
8439 // Check if load can be rewritten as load of vector.
8440 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8441 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8442 return false;
8443 } else {
8444 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8445 }
8446
8447 unsigned E = VL.size();
8448 if (!ResizeAllowed && NElts != E)
8449 return false;
8450 SmallVector<int> Indices(E, PoisonMaskElem);
8451 unsigned MinIdx = NElts, MaxIdx = 0;
8452 for (auto [I, V] : enumerate(VL)) {
8453 auto *Inst = dyn_cast<Instruction>(V);
8454 if (!Inst)
8455 continue;
8456 if (Inst->getOperand(0) != Vec)
8457 return false;
8458 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8459 if (isa<UndefValue>(EE->getIndexOperand()))
8460 continue;
8461 std::optional<unsigned> Idx = getExtractIndex(Inst);
8462 if (!Idx)
8463 return false;
8464 const unsigned ExtIdx = *Idx;
8465 if (ExtIdx >= NElts)
8466 continue;
8467 Indices[I] = ExtIdx;
8468 if (MinIdx > ExtIdx)
8469 MinIdx = ExtIdx;
8470 if (MaxIdx < ExtIdx)
8471 MaxIdx = ExtIdx;
8472 }
8473 if (MaxIdx - MinIdx + 1 > E)
8474 return false;
8475 if (MaxIdx + 1 <= E)
8476 MinIdx = 0;
8477
8478 // Check that all of the indices extract from the correct offset.
8479 bool ShouldKeepOrder = true;
8480 // Assign to all items the initial value E + 1 so we can check if the extract
8481 // instruction index was used already.
8482 // Also, later we can check that all the indices are used and we have a
8483 // consecutive access in the extract instructions, by checking that no
8484 // element of CurrentOrder still has value E + 1.
8485 CurrentOrder.assign(E, E);
8486 for (unsigned I = 0; I < E; ++I) {
8487 if (Indices[I] == PoisonMaskElem)
8488 continue;
8489 const unsigned ExtIdx = Indices[I] - MinIdx;
8490 if (CurrentOrder[ExtIdx] != E) {
8491 CurrentOrder.clear();
8492 return false;
8493 }
8494 ShouldKeepOrder &= ExtIdx == I;
8495 CurrentOrder[ExtIdx] = I;
8496 }
8497 if (ShouldKeepOrder)
8498 CurrentOrder.clear();
8499
8500 return ShouldKeepOrder;
8501}
8502
8503bool BoUpSLP::areAllUsersVectorized(
8504 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
8505 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
8506 all_of(I->users(), [this](User *U) {
8507 return ScalarToTreeEntry.contains(U) ||
8508 isVectorLikeInstWithConstOps(U) ||
8509 (isa<ExtractElementInst>(U) && MustGather.contains(U));
8510 });
8511}
8512
8513static std::pair<InstructionCost, InstructionCost>
8516 ArrayRef<Type *> ArgTys) {
8518
8519 // Calculate the cost of the scalar and vector calls.
8520 FastMathFlags FMF;
8521 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
8522 FMF = FPCI->getFastMathFlags();
8524 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
8526 auto IntrinsicCost =
8528
8529 auto Shape = VFShape::get(CI->getFunctionType(),
8531 false /*HasGlobalPred*/);
8532 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8533 auto LibCost = IntrinsicCost;
8534 if (!CI->isNoBuiltin() && VecFunc) {
8535 // Calculate the cost of the vector library call.
8536 // If the corresponding vector call is cheaper, return its cost.
8537 LibCost =
8538 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
8539 }
8540 return {IntrinsicCost, LibCost};
8541}
8542
8543void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
8544 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
8545 SmallVectorImpl<Value *> *OpScalars,
8546 SmallVectorImpl<Value *> *AltScalars) const {
8547 unsigned Sz = Scalars.size();
8548 Mask.assign(Sz, PoisonMaskElem);
8549 SmallVector<int> OrderMask;
8550 if (!ReorderIndices.empty())
8551 inversePermutation(ReorderIndices, OrderMask);
8552 for (unsigned I = 0; I < Sz; ++I) {
8553 unsigned Idx = I;
8554 if (!ReorderIndices.empty())
8555 Idx = OrderMask[I];
8556 auto *OpInst = cast<Instruction>(Scalars[Idx]);
8557 if (IsAltOp(OpInst)) {
8558 Mask[I] = Sz + Idx;
8559 if (AltScalars)
8560 AltScalars->push_back(OpInst);
8561 } else {
8562 Mask[I] = Idx;
8563 if (OpScalars)
8564 OpScalars->push_back(OpInst);
8565 }
8566 }
8567 if (!ReuseShuffleIndices.empty()) {
8568 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
8569 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
8570 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
8571 });
8572 Mask.swap(NewMask);
8573 }
8574}
8575
8577 const Instruction *MainOp,
8578 const Instruction *AltOp,
8579 const TargetLibraryInfo &TLI) {
8580 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
8581 auto *AltCI = cast<CmpInst>(AltOp);
8582 CmpInst::Predicate MainP = MainCI->getPredicate();
8583 CmpInst::Predicate AltP = AltCI->getPredicate();
8584 assert(MainP != AltP && "Expected different main/alternate predicates.");
8585 auto *CI = cast<CmpInst>(I);
8586 if (isCmpSameOrSwapped(MainCI, CI, TLI))
8587 return false;
8588 if (isCmpSameOrSwapped(AltCI, CI, TLI))
8589 return true;
8590 CmpInst::Predicate P = CI->getPredicate();
8592
8593 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
8594 "CmpInst expected to match either main or alternate predicate or "
8595 "their swap.");
8596 (void)AltP;
8597 return MainP != P && MainP != SwappedP;
8598 }
8599 return I->getOpcode() == AltOp->getOpcode();
8600}
8601
8602TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
8603 assert(!Ops.empty());
8604 const auto *Op0 = Ops.front();
8605
8606 const bool IsConstant = all_of(Ops, [](Value *V) {
8607 // TODO: We should allow undef elements here
8608 return isConstant(V) && !isa<UndefValue>(V);
8609 });
8610 const bool IsUniform = all_of(Ops, [=](Value *V) {
8611 // TODO: We should allow undef elements here
8612 return V == Op0;
8613 });
8614 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
8615 // TODO: We should allow undef elements here
8616 if (auto *CI = dyn_cast<ConstantInt>(V))
8617 return CI->getValue().isPowerOf2();
8618 return false;
8619 });
8620 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
8621 // TODO: We should allow undef elements here
8622 if (auto *CI = dyn_cast<ConstantInt>(V))
8623 return CI->getValue().isNegatedPowerOf2();
8624 return false;
8625 });
8626
8628 if (IsConstant && IsUniform)
8630 else if (IsConstant)
8632 else if (IsUniform)
8634
8636 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
8637 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
8638
8639 return {VK, VP};
8640}
8641
namespace {
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  // Scalar element type of the vectors being shuffled; with REVEC this may
  // itself be a FixedVectorType.
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      // NOTE(review): the condition's first line (presumably a
      // ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) call) was
      // elided during extraction — verify against upstream.
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            // NOTE(review): the second disjunct of this lambda (an identity
            // check on Slice) was elided during extraction.
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    // Compose: NewMask[I] = Mask[ExtMask[I]], normalized modulo the original
    // vector length; poison stays poison.
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }

  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly mark indices in mask as undef.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        // NOTE(review): part of this condition (a size comparison against
        // IdentityMask.size()) was elided during extraction.
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
                                 IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we did not lost
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
      // expensive, the analysis founds out, that the source vector is just a
      // broadcast, this original mask can be transformed to identity mask <0,
      // 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Translate Mask through SV's own mask to see which of SV's operands
      // the requested elements actually come from.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef =
          isUndefVector(SV->getOperand(0),
                        buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
              .all();
      bool IsOp2Undef =
          isUndefVector(SV->getOperand(1),
                        buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
              .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        // NOTE(review): the comparison's right-hand side (presumably
        // PoisonMaskElem) was elided during extraction.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
            I = PoisonMaskElem;
        }
        break;
      }
      // One operand is fully undef: fold SV's mask into Mask and step down to
      // the live operand.
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    // NOTE(review): the tail of this condition (after the isIdentityMask
    // check) was elided during extraction.
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        // NOTE(review): the final operand of this return expression was
        // elided during extraction.
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 &&
        !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      // Split the two-source mask into two single-source masks.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      // Iterate until neither operand can be peeked through any further.
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            // Both are resizing shuffles of same-typed sources whose second
            // operands are unused: step down to the sources.
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      // NOTE(review): the second argument of std::max (presumably the same
      // expression over Op2's type) was elided during extraction.
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                        ->getElementCount()
                        .getKnownMinValue());
      // Merge the two single-source masks back into one two-source mask.
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // NOTE(review): part of this condition (between the splat-mask check
      // and the shuffle-mask comparison) was elided during extraction.
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
               cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                   ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    // Single-source case: try to peek through to a better operand.
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
} // namespace
8978
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// Returns a (scalar cost, vector cost) pair for the pointer computations.
/// NOTE(review): the line carrying the function name and its first parameters
/// (TTI and the Ptrs list) was elided during extraction — verify against
/// upstream; this is the function referred to below as getGEPCosts.
static std::pair<InstructionCost, InstructionCost>
    Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
    Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      // NOTE(review): the declaration of Ptr (presumably
      // auto *Ptr = dyn_cast<GEPOperator>(V);) was elided during extraction.
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
                                       TTI::PointersChainInfo::getKnownStride(),
                                       VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    // NOTE(review): the declaration of Ptr inside the lambda below was elided
    // during extraction.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    // Pick a representative GEP (BasePtr itself, or the first GEP among Ptrs)
    // to model the single vector pointer computation.
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
9060
  // NOTE(review): the enclosing signature line (presumably
  // void BoUpSLP::transformNodes() {) was elided during extraction — this is
  // the function body that post-processes the built vectorizable tree:
  // re-vectorizes profitable sub-slices of gather nodes, converts
  // reversed consecutive loads/stores to strided ones, and recognizes
  // min/max select patterns.
  BaseGraphSize = VectorizableTree.size();
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
  // NOTE(review): the declaration of Candidates used below was elided during
  // extraction.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    for (unsigned Op : seq<unsigned>(S.MainOp->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    // NOTE(review): the second disjunct of this return expression was elided
    // during extraction.
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) || P.first == P.second;
                        }) ||
        });
  };
  // The tree may grow here, so iterate over nodes, built before.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with the
      // same opcode and same parent block or all constants.
      if (VL.size() <= 2 ||
          !(!E.getOpcode() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series of
      // insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      // Try progressively smaller (power-of-two) slice widths.
      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF = bit_ceil(VF) / 2) {
        SmallVector<unsigned> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
          // Constant already handled effectively - skip.
          if (allConstant(Slice))
            continue;
          // Do not try to vectorize small splats (less than vector register and
          // only with the single non-undef element).
          bool IsSplat = isSplat(Slice);
          // NOTE(review): one line of the std::clamp comparison below was
          // elided during extraction.
          if (Slices.empty() || !IsSplat ||
              (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
                                             Slice.front()->getType(), VF)),
                                         1U, VF - 1) !=
                              Slice.front()->getType(), 2 * VF)),
                                         1U, 2 * VF)) ||
              count(Slice, Slice.front()) ==
                  (isa<UndefValue>(Slice.front()) ? VF - 1 : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S.getOpcode() || S.isAltShuffle() || !allSameBlock(Slice))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values or if all users are vectorized.
              // For expensive instructions extra extracts might be profitable.
              // NOTE(review): part of this condition was elided during
              // extraction.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  !all_of(Slice, [&](Value *V) {
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather)
                  continue;
              // NOTE(review): interior lines of this else-if condition were
              // elided during extraction.
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                             cast<Instruction>(Slice.front()), CostKind) <
                         !CheckOperandsProfitability(
                             cast<Instruction>(Slice.front()),
                             cast<Instruction>(Slice.back()), S))) {
                // Do not vectorize extractelements (handled effectively
                // alread). Do not vectorize non-profitable instructions (with
                // low cost and non-vectorizable operands.)
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt);
        }
        // Record a vectorized sub-slice and shrink the remaining [StartIdx,
        // End) range when the slice touches either boundary.
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + VF;
          if (End == Cnt + VF)
            End = Cnt;
        };
        for (unsigned Cnt : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            AddCombinedNode(SE->Idx, Cnt);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          // If the attempt produced only a single new gather node (not an
          // extractelement gather and not a splat), it did not help - undo it.
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            VectorizableTree.pop_back();
            continue;
          }
          AddCombinedNode(PrevSize, Cnt);
        }
      }
    }
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        // NOTE(review): the lines completing this cost expression and
        // declaring StridedCost (a getStridedMemoryOpCost call) were elided
        // during extraction.
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                Instruction::Load, VecTy, BaseLI->getPointerOperand(),
                /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        // NOTE(review): the lines completing this cost expression and
        // declaring StridedCost (a getStridedMemoryOpCost call) were elided
        // during extraction.
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                Instruction::Store, VecTy, BaseSI->getPointerOperand(),
                /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive store -
          // transform the node to strided store.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }

  // Single load node - exit.
  if (VectorizableTree.size() <= 1 &&
      VectorizableTree.front()->getOpcode() == Instruction::Load)
    return;
  // Small graph with small VF - exit.
  constexpr unsigned SmallTree = 3;
  constexpr unsigned SmallVF = 2;
  if ((VectorizableTree.size() <= SmallTree &&
       VectorizableTree.front()->Scalars.size() == SmallVF) ||
      (VectorizableTree.size() <= 2 && UserIgnoreList))
    return;

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  // NOTE(review): the declaration of GatheredLoads was elided during
  // extraction.

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    // Collect gather nodes that are (or contain) scalar loads not yet
    // vectorized or deleted, excluding splats.
    if (E.isGather() &&
        (E.getOpcode() == Instruction::Load ||
         (!E.getOpcode() && any_of(E.Scalars,
                                   [&](Value *V) {
                                     return isa<LoadInst>(V) &&
                                            !isVectorized(V) &&
                                            !isDeleted(cast<Instruction>(V));
                                   }))) &&
        !isSplat(E.Scalars))
      gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI,
                                      GatheredLoads);
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
9320
9321/// Merges shuffle masks and emits final shuffle instruction, if required. It
9322/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
9323/// when the actual shuffle instruction is generated only if this is actually
9324/// required. Otherwise, the shuffle instruction emission is delayed till the
9325/// end of the process, to reduce the number of emitted instructions and further
9326/// analysis/transformations.
9327class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
9328 bool IsFinalized = false;
9329 SmallVector<int> CommonMask;
9331 const TargetTransformInfo &TTI;
9333 SmallDenseSet<Value *> VectorizedVals;
9334 BoUpSLP &R;
9335 SmallPtrSetImpl<Value *> &CheckedExtracts;
9337 /// While set, still trying to estimate the cost for the same nodes and we
9338 /// can delay actual cost estimation (virtual shuffle instruction emission).
9339 /// May help better estimate the cost if same nodes must be permuted + allows
9340 /// to move most of the long shuffles cost estimation to TTI.
9341 bool SameNodesEstimated = true;
9342
  // Returns an all-ones constant of type \p Ty. For pointer (or vector of
  // pointer) types, builds an all-ones integer of the pointer's store size
  // and converts it, splatting across vector element counts.
  // NOTE(review): the declaration of Res (presumably
  // Constant *Res = ConstantExpr::getIntToPtr( ) was elided during
  // extraction — verify against upstream.
  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
          ConstantInt::getAllOnesValue(
              IntegerType::get(Ty->getContext(),
                               DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }
9356
  /// Estimates the cost of materializing the scalars in \p VL as a vector
  /// (build-vector / gather). Splats without a \p Root are costed as a single
  /// insert plus (if needed) a broadcast shuffle; otherwise falls back to the
  /// generic gather cost.
  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // All-constant or all-undef gathers are free.
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          // NOTE(review): the leading arguments of this getShuffleCost call
          // (shuffle kind, vector type, mask, cost kind) were elided during
          // extraction.
          return TTI.getShuffleCost(
              std::distance(VL.begin(), It) * getNumElements(ScalarTy),
              cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }

      // Insert the single non-poison scalar at lane 0, then broadcast it.
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
                                 PoisonValue::get(VecTy), *It);
      // NOTE(review): the shuffle-kind argument (presumably TTI::SK_Broadcast)
      // was elided during extraction.
      return InsertCost + ::getShuffleCost(TTI,
                                           VecTy, ShuffleMask, CostKind,
                                           /*Index=*/0, /*SubTp=*/nullptr,
                                           /*Args=*/*It);
    }
    // NOTE(review): the all-undef branch of this conditional expression was
    // elided during extraction.
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };
9404
  /// Compute the cost of creating a vector containing the extracted values from
  /// \p VL.
  /// NOTE(review): the return-type line (presumably InstructionCost) was
  /// elided during extraction.
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // Widest source vector among the extractelements in VL.
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    // Checks whether a per-register submask can be served by at most two
    // source registers; normalizes the mask in place and records the
    // extract-subvector offsets in Indices.
    // NOTE(review): the second parameter line of this lambda (presumably the
    // Indices output vector) and the local ShuffleKind declaration were
    // elided during extraction.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // Offset of the first referenced register, aligned to register size.
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        // More than two source registers: cannot model as one shuffle.
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            // Find the aligned offset of the second source register.
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        // Rebase the mask element to the [0, 2*EltsPerVector) local range.
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    // NOTE(review): the declaration of Cost (InstructionCost) was elided
    // during extraction.

    // Process extracts in blocks of EltsPerVector to check if the source vector
    // operand can be re-used directly. If not, add the cost of creating a
    // shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      // NOTE(review): the declaration of Indices (SmallVector<unsigned, 2>)
      // was elided during extraction.
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        // NOTE(review): part of this condition (an identity-mask check over
        // MaskSlice) was elided during extraction.
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, *RegShuffleKind,
                             getWidenedType(ScalarTy, EltsPerVector), SubMask);
      }
      // Account the extract-subvector cost for each referenced register.
      // NOTE(review): the head of this cost accumulation (a
      // ::getShuffleCost(TTI, TTI::SK_ExtractSubvector, ...) call) was elided
      // during extraction.
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
               "SK_ExtractSubvector index out of range");
            getWidenedType(ScalarTy, alignTo(NumElts, EltsPerVector)), {},
            CostKind, Idx, getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
9528 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
9529 /// shuffle emission.
9530 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
9531 ArrayRef<int> Mask) {
9532 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9533 if (Mask[Idx] != PoisonMaskElem)
9534 CommonMask[Idx] = Idx;
9535 }
9536 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
9537 /// mask \p Mask, register number \p Part, that includes \p SliceSize
9538 /// elements.
/// Mutates the accumulated Cost, CommonMask, and SameNodesEstimated state.
9539 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
9540 ArrayRef<int> Mask, unsigned Part,
9541 unsigned SliceSize) {
9542 if (SameNodesEstimated) {
9543 // Delay the cost estimation if the same nodes are reshuffling.
9544 // If we already requested the cost of reshuffling of E1 and E2 before, no
9545 // need to estimate another cost with the sub-Mask, instead include this
9546 // sub-Mask into the CommonMask to estimate it later and avoid double cost
9547 // estimation.
// Case 1: pending inputs are exactly (E1, E2) (or just E1 when E2 is null):
// splice this Part's sub-mask into CommonMask and defer the estimation.
9548 if ((InVectors.size() == 2 &&
9549 InVectors.front().get<const TreeEntry *>() == &E1 &&
9550 InVectors.back().get<const TreeEntry *>() == E2) ||
9551 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
9552 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
// The target slice of CommonMask must still be all-poison so we do not
// clobber an earlier, not-yet-estimated sub-mask.
9553 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
9554 [](int Idx) { return Idx == PoisonMaskElem; }) &&
9555 "Expected all poisoned elements.");
9556 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
9557 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
9558 return;
9559 }
9560 // Found non-matching nodes - need to estimate the cost for the matched
9561 // and transform mask.
// Flush the pending shuffle cost before switching to the new node pair.
9562 Cost += createShuffle(InVectors.front(),
9563 InVectors.size() == 1 ? nullptr : InVectors.back(),
9564 CommonMask);
9565 transformMaskAfterShuffle(CommonMask, CommonMask);
9566 }
9567 SameNodesEstimated = false;
// Merging single entry E1 with one already-pending input: offset E1's lanes
// past the pending input's vector factor, then emit a single shuffle.
9568 if (!E2 && InVectors.size() == 1) {
9569 unsigned VF = E1.getVectorFactor();
9570 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
9571 VF = std::max(VF,
9572 cast<FixedVectorType>(V1->getType())->getNumElements());
9573 } else {
9574 const auto *E = InVectors.front().get<const TreeEntry *>();
9575 VF = std::max(VF, E->getVectorFactor());
9576 }
9577 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9578 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9579 CommonMask[Idx] = Mask[Idx] + VF;
9580 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
9581 transformMaskAfterShuffle(CommonMask, CommonMask);
9582 } else {
// General case: estimate the (E1, E2) shuffle directly with this sub-Mask.
9583 Cost += createShuffle(&E1, E2, Mask);
9584 transformMaskAfterShuffle(CommonMask, Mask);
9585 }
9586 }
9587
// Cost-model counterpart of the IR shuffle builder: each builder callback
// returns a TTI shuffle cost instead of emitting instructions, so the shared
// BaseShuffleAnalysis::createShuffle machinery can be reused for costing.
9588 class ShuffleCostBuilder {
9589 const TargetTransformInfo &TTI;
9590
// True when \p Mask requires no real shuffle (empty or, for a full-width
// mask, an identity-like pattern with extract index 0).
// NOTE(review): this listing elides original lines 9595-9596 (the actual
// mask-kind checks that set/consume Index); code kept verbatim.
9591 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
9592 int Index = -1;
9593 return Mask.empty() ||
9594 (VF == Mask.size() &&
9597 Index == 0);
9598 }
9599
9600 public:
9601 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
9602 ~ShuffleCostBuilder() = default;
// Cost of a two-source shuffle of V1 (and an ignored second value) by Mask;
// free when the mask is trivially identity/empty.
9603 InstructionCost createShuffleVector(Value *V1, Value *,
9604 ArrayRef<int> Mask) const {
9605 // Empty mask or identity mask are free.
9606 unsigned VF =
9607 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9608 if (isEmptyOrIdentity(Mask, VF))
9609 return TTI::TCC_Free;
9610 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
9611 cast<VectorType>(V1->getType()), Mask);
9612 }
// Cost of a single-source permute; same identity short-circuit as above.
9613 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
9614 // Empty mask or identity mask are free.
9615 unsigned VF =
9616 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9617 if (isEmptyOrIdentity(Mask, VF))
9618 return TTI::TCC_Free;
9619 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
9620 cast<VectorType>(V1->getType()), Mask);
9621 }
// Identity passthrough, poison materialization and operand resizing carry no
// cost in this model.
9622 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
9623 InstructionCost createPoison(Type *Ty, unsigned VF) const {
9624 return TTI::TCC_Free;
9625 }
9626 void resizeToMatch(Value *&, Value *&) const {}
9627 };
9628
9629 /// Smart shuffle instruction emission, walks through shuffles trees and
9630 /// tries to find the best matching vector for the actual shuffle
9631 /// instruction.
/// Returns the estimated cost of shuffling P1/P2 (each either a materialized
/// Value or a TreeEntry) by \p Mask, including any min-bitwidth cast costs.
// NOTE(review): the return-type line (original 9632) and the second
// parameter line (original 9634, the P2 PointerUnion) are elided in this
// listing; code kept verbatim.
9633 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
9635 ArrayRef<int> Mask) {
9636 ShuffleCostBuilder Builder(TTI);
9637 SmallVector<int> CommonMask(Mask);
9638 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
9639 unsigned CommonVF = Mask.size();
9640 InstructionCost ExtraCost = 0;
// Cost of converting entry E's element type to ScalarTy when MinBWs analysis
// shrank it (trunc) or it must be widened back (sext/zext).
9641 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
9642 unsigned VF) -> InstructionCost {
9643 if (E.isGather() && allConstant(E.Scalars))
9644 return TTI::TCC_Free;
9645 Type *EScalarTy = E.Scalars.front()->getType();
9646 bool IsSigned = true;
9647 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
9648 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
9649 IsSigned = It->second.second;
9650 }
9651 if (EScalarTy != ScalarTy) {
9652 unsigned CastOpcode = Instruction::Trunc;
9653 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
9654 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
9655 if (DstSz > SrcSz)
9656 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
9657 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
9658 getWidenedType(EScalarTy, VF),
9660 }
9661 return TTI::TCC_Free;
9662 };
// Same idea for an already-materialized vector value; signedness is inferred
// from known-non-negativity since there is no MinBWs entry for plain values.
9663 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
9664 if (isa<Constant>(V))
9665 return TTI::TCC_Free;
9666 auto *VecTy = cast<VectorType>(V->getType());
9667 Type *EScalarTy = VecTy->getElementType();
9668 if (EScalarTy != ScalarTy) {
9669 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
9670 unsigned CastOpcode = Instruction::Trunc;
9671 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
9672 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
9673 if (DstSz > SrcSz)
9674 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
9675 return TTI.getCastInstrCost(
9676 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
9678 }
9679 return TTI::TCC_Free;
9680 };
// The five cases below normalize both inputs to placeholder constant vectors
// of the common VF (null-vector for input 1, all-ones for input 2), folding
// any per-entry reuse masks into CommonMask, then defer to the builder.
9681 if (!V1 && !V2 && !P2.isNull()) {
9682 // Shuffle 2 entry nodes.
9683 const TreeEntry *E = P1.get<const TreeEntry *>();
9684 unsigned VF = E->getVectorFactor();
9685 const TreeEntry *E2 = P2.get<const TreeEntry *>();
9686 CommonVF = std::max(VF, E2->getVectorFactor());
9687 assert(all_of(Mask,
9688 [=](int Idx) {
9689 return Idx < 2 * static_cast<int>(CommonVF);
9690 }) &&
9691 "All elements in mask must be less than 2 * CommonVF.");
9692 if (E->Scalars.size() == E2->Scalars.size()) {
9693 SmallVector<int> EMask = E->getCommonMask();
9694 SmallVector<int> E2Mask = E2->getCommonMask();
9695 if (!EMask.empty() || !E2Mask.empty()) {
// Compose the entries' own reuse masks into CommonMask lane by lane.
9696 for (int &Idx : CommonMask) {
9697 if (Idx == PoisonMaskElem)
9698 continue;
9699 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
9700 Idx = EMask[Idx];
9701 else if (Idx >= static_cast<int>(CommonVF))
9702 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
9703 E->Scalars.size();
9704 }
9705 }
9706 CommonVF = E->Scalars.size();
9707 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
9708 GetNodeMinBWAffectedCost(*E2, CommonVF);
9709 } else {
9710 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
9711 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
9712 }
9713 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9714 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9715 } else if (!V1 && P2.isNull()) {
9716 // Shuffle single entry node.
9717 const TreeEntry *E = P1.get<const TreeEntry *>();
9718 unsigned VF = E->getVectorFactor();
9719 CommonVF = VF;
9720 assert(
9721 all_of(Mask,
9722 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
9723 "All elements in mask must be less than CommonVF.");
9724 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
9725 SmallVector<int> EMask = E->getCommonMask();
9726 assert(!EMask.empty() && "Expected non-empty common mask.");
9727 for (int &Idx : CommonMask) {
9728 if (Idx != PoisonMaskElem)
9729 Idx = EMask[Idx];
9730 }
9731 CommonVF = E->Scalars.size();
9732 }
9733 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
9734 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9735 // Not identity/broadcast? Try to see if the original vector is better.
9736 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
9737 CommonVF == CommonMask.size() &&
9738 any_of(enumerate(CommonMask),
9739 [](const auto &&P) {
9740 return P.value() != PoisonMaskElem &&
9741 static_cast<unsigned>(P.value()) != P.index();
9742 }) &&
9743 any_of(CommonMask,
9744 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
9745 SmallVector<int> ReorderMask;
9746 inversePermutation(E->ReorderIndices, ReorderMask);
9747 ::addMask(CommonMask, ReorderMask);
9748 }
9749 } else if (V1 && P2.isNull()) {
9750 // Shuffle single vector.
9751 ExtraCost += GetValueMinBWAffectedCost(V1);
9752 CommonVF = getVF(V1);
9753 assert(
9754 all_of(Mask,
9755 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
9756 "All elements in mask must be less than CommonVF.");
9757 } else if (V1 && !V2) {
9758 // Shuffle vector and tree node.
9759 unsigned VF = getVF(V1);
9760 const TreeEntry *E2 = P2.get<const TreeEntry *>();
9761 CommonVF = std::max(VF, E2->getVectorFactor());
9762 assert(all_of(Mask,
9763 [=](int Idx) {
9764 return Idx < 2 * static_cast<int>(CommonVF);
9765 }) &&
9766 "All elements in mask must be less than 2 * CommonVF.");
9767 if (E2->Scalars.size() == VF && VF != CommonVF) {
9768 SmallVector<int> E2Mask = E2->getCommonMask();
9769 assert(!E2Mask.empty() && "Expected non-empty common mask.");
9770 for (int &Idx : CommonMask) {
9771 if (Idx == PoisonMaskElem)
9772 continue;
9773 if (Idx >= static_cast<int>(CommonVF))
9774 Idx = E2Mask[Idx - CommonVF] + VF;
9775 }
9776 CommonVF = VF;
9777 }
9778 ExtraCost += GetValueMinBWAffectedCost(V1);
9779 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9780 ExtraCost += GetNodeMinBWAffectedCost(
9781 *E2, std::min(CommonVF, E2->getVectorFactor()));
9782 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9783 } else if (!V1 && V2) {
9784 // Shuffle vector and tree node.
9785 unsigned VF = getVF(V2);
9786 const TreeEntry *E1 = P1.get<const TreeEntry *>();
9787 CommonVF = std::max(VF, E1->getVectorFactor());
9788 assert(all_of(Mask,
9789 [=](int Idx) {
9790 return Idx < 2 * static_cast<int>(CommonVF);
9791 }) &&
9792 "All elements in mask must be less than 2 * CommonVF.");
9793 if (E1->Scalars.size() == VF && VF != CommonVF) {
9794 SmallVector<int> E1Mask = E1->getCommonMask();
9795 assert(!E1Mask.empty() && "Expected non-empty common mask.");
9796 for (int &Idx : CommonMask) {
9797 if (Idx == PoisonMaskElem)
9798 continue;
9799 if (Idx >= static_cast<int>(CommonVF))
9800 Idx = E1Mask[Idx - CommonVF] + VF;
9801 else
9802 Idx = E1Mask[Idx];
9803 }
9804 CommonVF = VF;
9805 }
9806 ExtraCost += GetNodeMinBWAffectedCost(
9807 *E1, std::min(CommonVF, E1->getVectorFactor()));
9808 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9809 ExtraCost += GetValueMinBWAffectedCost(V2);
9810 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9811 } else {
9812 assert(V1 && V2 && "Expected both vectors.");
9813 unsigned VF = getVF(V1);
9814 CommonVF = std::max(VF, getVF(V2));
9815 assert(all_of(Mask,
9816 [=](int Idx) {
9817 return Idx < 2 * static_cast<int>(CommonVF);
9818 }) &&
9819 "All elements in mask must be less than 2 * CommonVF.");
9820 ExtraCost +=
9821 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
9822 if (V1->getType() != V2->getType()) {
9823 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9824 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9825 } else {
9826 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
9827 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
9828 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
9829 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
9830 }
9831 }
// NOTE(review): original line 9834 (a transformScalarShuffleIndiciesToVector
// call or similar REVEC handling) is elided in this listing.
9832 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9833 assert(SLPReVec && "FixedVectorType is not expected.");
9835 CommonMask);
9836 }
// Collapse the pending inputs to a single placeholder of the final width.
9837 InVectors.front() =
9838 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
9839 if (InVectors.size() == 2)
9840 InVectors.pop_back();
9841 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
9842 V1, V2, CommonMask, Builder);
9843 }
9844
9845public:
// NOTE(review): the constructor's opening line (original 9846 - its name and
// leading parameters, presumably ShuffleCostEstimator(Type *ScalarTy,
// TargetTransformInfo &TTI, ...)) is elided in this listing; confirm against
// upstream. The body only snapshots the inputs into member state.
9847 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
9848 SmallPtrSetImpl<Value *> &CheckedExtracts)
9849 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
9850 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
9851 CheckedExtracts(CheckedExtracts) {}
/// Accounts for extractelement-based gathers: subtracts the cost of extracts
/// that become dead after vectorization, adds the cost of re-shuffling their
/// source vectors, and seeds InVectors/CommonMask. Returns the common vector
/// base (or a placeholder when multiple bases force UseVecBaseAsInput).
9852 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
9853 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
9854 unsigned NumParts, bool &UseVecBaseAsInput) {
9855 UseVecBaseAsInput = false;
9856 if (Mask.empty())
9857 return nullptr;
9858 Value *VecBase = nullptr;
9859 ArrayRef<Value *> VL = E->Scalars;
9860 // Check if it can be considered reused if same extractelements were
9861 // vectorized already.
9862 bool PrevNodeFound = any_of(
9863 ArrayRef(R.VectorizableTree).take_front(E->Idx),
9864 [&](const std::unique_ptr<TreeEntry> &TE) {
9865 return ((!TE->isAltShuffle() &&
9866 TE->getOpcode() == Instruction::ExtractElement) ||
9867 TE->isGather()) &&
9868 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
9869 return VL.size() > Data.index() &&
9870 (Mask[Data.index()] == PoisonMaskElem ||
9871 isa<UndefValue>(VL[Data.index()]) ||
9872 Data.value() == VL[Data.index()]);
9873 });
9874 });
9875 SmallPtrSet<Value *, 4> UniqueBases;
9876 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
// Walk the scalars register-slice by register-slice, crediting extracts that
// will die once their users are vectorized.
9877 for (unsigned Part : seq<unsigned>(NumParts)) {
9878 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
9879 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
9880 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
9881 // Ignore non-extractelement scalars.
9882 if (isa<UndefValue>(V) ||
9883 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
9884 continue;
9885 // If all users of instruction are going to be vectorized and this
9886 // instruction itself is not going to be vectorized, consider this
9887 // instruction as dead and remove its cost from the final cost of the
9888 // vectorized tree.
9889 // Also, avoid adjusting the cost for extractelements with multiple uses
9890 // in different graph entries.
9891 auto *EE = cast<ExtractElementInst>(V);
9892 VecBase = EE->getVectorOperand();
9893 UniqueBases.insert(VecBase);
9894 const TreeEntry *VE = R.getTreeEntry(V);
9895 if (!CheckedExtracts.insert(V).second ||
9896 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
9897 any_of(EE->users(),
9898 [&](User *U) {
9899 return isa<GetElementPtrInst>(U) &&
9900 !R.areAllUsersVectorized(cast<Instruction>(U),
9901 &VectorizedVals);
9902 }) ||
9903 (VE && VE != E))
9904 continue;
9905 std::optional<unsigned> EEIdx = getExtractIndex(EE);
9906 if (!EEIdx)
9907 continue;
9908 unsigned Idx = *EEIdx;
9909 // Take credit for instruction that will become dead.
9910 if (EE->hasOneUse() || !PrevNodeFound) {
9911 Instruction *Ext = EE->user_back();
9912 if (isa<SExtInst, ZExtInst>(Ext) &&
9913 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9914 // Use getExtractWithExtendCost() to calculate the cost of
9915 // extractelement/ext pair.
9916 Cost -=
9917 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
9918 EE->getVectorOperandType(), Idx);
9919 // Add back the cost of s|zext which is subtracted separately.
// NOTE(review): original lines 9920/9922 (the Cost += getCastInstrCost
// statement spans them) are elided in this listing; kept verbatim.
9921 Ext->getOpcode(), Ext->getType(), EE->getType(),
9923 continue;
9924 }
9925 }
9926 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
9927 CostKind, Idx);
9928 }
9929 }
9930 // Check that gather of extractelements can be represented as just a
9931 // shuffle of a single/two vectors the scalars are extracted from.
9932 // Found the bunch of extractelement instructions that must be gathered
9933 // into a vector and can be represented as a permutation elements in a
9934 // single input vector or of 2 input vectors.
9935 // Done for reused if same extractelements were vectorized already.
9936 if (!PrevNodeFound)
9937 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
9938 InVectors.assign(1, E);
9939 CommonMask.assign(Mask.begin(), Mask.end());
9940 transformMaskAfterShuffle(CommonMask, CommonMask);
9941 SameNodesEstimated = false;
// Multiple distinct source vectors across several registers cannot be
// represented by one base; signal the caller to use the vector base input.
9942 if (NumParts != 1 && UniqueBases.size() != 1) {
9943 UseVecBaseAsInput = true;
9944 VecBase =
9945 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
9946 }
9947 return VecBase;
9948 }
9949 /// Checks if the specified entry \p E needs to be delayed because of its
9950 /// dependency nodes.
/// Cost analysis never defers, so this always returns std::nullopt.
9951 std::optional<InstructionCost>
// NOTE(review): the second parameter line (original 9953) is elided in this
// listing; kept verbatim.
9952 needToDelay(const TreeEntry *,
9954 // No need to delay the cost estimation during analysis.
9955 return std::nullopt;
9956 }
9957 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9958 if (&E1 == &E2) {
9959 assert(all_of(Mask,
9960 [&](int Idx) {
9961 return Idx < static_cast<int>(E1.getVectorFactor());
9962 }) &&
9963 "Expected single vector shuffle mask.");
9964 add(E1, Mask);
9965 return;
9966 }
9967 if (InVectors.empty()) {
9968 CommonMask.assign(Mask.begin(), Mask.end());
9969 InVectors.assign({&E1, &E2});
9970 return;
9971 }
9972 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9973 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9974 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9975 if (NumParts == 0 || NumParts >= Mask.size() ||
9976 MaskVecTy->getNumElements() % NumParts != 0 ||
9977 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
9978 MaskVecTy->getNumElements() / NumParts))
9979 NumParts = 1;
9980 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
9981 const auto *It =
9982 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
9983 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
9984 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
9985 }
9986 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9987 if (InVectors.empty()) {
9988 CommonMask.assign(Mask.begin(), Mask.end());
9989 InVectors.assign(1, &E1);
9990 return;
9991 }
9992 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9993 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
9994 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
9995 if (NumParts == 0 || NumParts >= Mask.size() ||
9996 MaskVecTy->getNumElements() % NumParts != 0 ||
9997 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
9998 MaskVecTy->getNumElements() / NumParts))
9999 NumParts = 1;
10000 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10001 const auto *It =
10002 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10003 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10004 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10005 if (!SameNodesEstimated && InVectors.size() == 1)
10006 InVectors.emplace_back(&E1);
10007 }
10008 /// Adds 2 input vectors and the mask for their shuffling.
10009 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10010 // May come only for shuffling of 2 vectors with extractelements, already
10011 // handled in adjustExtracts.
// Therefore this overload only validates state: every defined lane of
// CommonMask must correspond to an extractelement from V1 or V2.
10012 assert(InVectors.size() == 1 &&
10013 all_of(enumerate(CommonMask),
10014 [&](auto P) {
10015 if (P.value() == PoisonMaskElem)
10016 return Mask[P.index()] == PoisonMaskElem;
10017 auto *EI =
// NOTE(review): the cast<ExtractElementInst>(InVectors.front() line
// (original 10018) is elided in this listing; kept verbatim.
10019 .get<const TreeEntry *>()
10020 ->Scalars[P.index()])
10021 return EI->getVectorOperand() == V1 ||
10022 EI->getVectorOperand() == V2;
10023 }) &&
10024 "Expected extractelement vectors.");
10025 }
10026 /// Adds another one input vector and the mask for the shuffling.
10027 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10028 if (InVectors.empty()) {
10029 assert(CommonMask.empty() && !ForExtracts &&
10030 "Expected empty input mask/vectors.");
10031 CommonMask.assign(Mask.begin(), Mask.end());
10032 InVectors.assign(1, V1);
10033 return;
10034 }
10035 if (ForExtracts) {
10036 // No need to add vectors here, already handled them in adjustExtracts.
10037 assert(InVectors.size() == 1 &&
10038 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
10039 all_of(enumerate(CommonMask),
10040 [&](auto P) {
10041 Value *Scalar = InVectors.front()
10042 .get<const TreeEntry *>()
10043 ->Scalars[P.index()];
10044 if (P.value() == PoisonMaskElem)
10045 return P.value() == Mask[P.index()] ||
10046 isa<UndefValue>(Scalar);
10047 if (isa<Constant>(V1))
10048 return true;
10049 auto *EI = cast<ExtractElementInst>(Scalar);
10050 return EI->getVectorOperand() == V1;
10051 }) &&
10052 "Expected only tree entry for extractelement vectors.");
10053 return;
10054 }
10055 assert(!InVectors.empty() && !CommonMask.empty() &&
10056 "Expected only tree entries from extracts/reused buildvectors.");
10057 unsigned VF = getVF(V1);
10058 if (InVectors.size() == 2) {
10059 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10060 transformMaskAfterShuffle(CommonMask, CommonMask);
10061 VF = std::max<unsigned>(VF, CommonMask.size());
10062 } else if (const auto *InTE =
10063 InVectors.front().dyn_cast<const TreeEntry *>()) {
10064 VF = std::max(VF, InTE->getVectorFactor());
10065 } else {
10066 VF = std::max(
10067 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
10068 ->getNumElements());
10069 }
10070 InVectors.push_back(V1);
10071 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10072 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10073 CommonMask[Idx] = Mask[Idx] + VF;
10074 }
/// Adds the cost of building the gathered vector \p VL and returns a
/// placeholder constant standing in for the materialized vector.
10075 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10076 Value *Root = nullptr) {
10077 Cost += getBuildVectorCost(VL, Root);
10078 if (!Root) {
10079 // FIXME: Need to find a way to avoid use of getNullValue here.
// NOTE(review): the declaration of Vals (original line 10080, presumably
// SmallVector<Constant *> Vals;) is elided in this listing.
10081 unsigned VF = VL.size();
10082 if (MaskVF != 0)
10083 VF = std::min(VF, MaskVF);
// Build a placeholder element per scalar: undef stays undef, everything
// else becomes a null constant of the scalar's type.
10084 for (Value *V : VL.take_front(VF)) {
10085 if (isa<UndefValue>(V)) {
10086 Vals.push_back(cast<Constant>(V));
10087 continue;
10088 }
10089 Vals.push_back(Constant::getNullValue(V->getType()));
10090 }
10091 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10092 assert(SLPReVec && "FixedVectorType is not expected.");
10093 // When REVEC is enabled, we need to expand vector types into scalar
10094 // types.
10095 unsigned VecTyNumElements = VecTy->getNumElements();
10096 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10097 for (auto [I, V] : enumerate(Vals)) {
10098 Type *ScalarTy = V->getType()->getScalarType();
10099 Constant *NewVal;
10100 if (isa<PoisonValue>(V))
10101 NewVal = PoisonValue::get(ScalarTy);
10102 else if (isa<UndefValue>(V))
10103 NewVal = UndefValue::get(ScalarTy);
10104 else
10105 NewVal = Constant::getNullValue(ScalarTy);
10106 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10107 NewVal);
10108 }
10109 Vals.swap(NewVals);
10110 }
10111 return ConstantVector::get(Vals);
10112 }
// NOTE(review): original lines 10113-10114 (the start of the non-null-Root
// return expression) are elided in this listing; kept verbatim.
10115 cast<FixedVectorType>(Root->getType())->getNumElements()),
10116 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10117 }
10119 /// Finalize emission of the shuffles.
/// Flushes all pending inputs/masks into the accumulated Cost, accounts for
/// inserted subvectors, and returns the total cost.
// NOTE(review): the finalize signature lines (original 10120-10121, return
// type and the ExtMask parameter) are elided in this listing; kept verbatim.
10122 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10123 unsigned VF = 0,
10124 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10125 IsFinalized = true;
// An Action callback needs a materialized value: flush pending shuffles,
// reset the mask to identity, and let the callback adjust value and mask.
10126 if (Action) {
10127 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10128 if (InVectors.size() == 2)
10129 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10130 else
10131 Cost += createShuffle(Vec, nullptr, CommonMask);
10132 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10133 if (CommonMask[Idx] != PoisonMaskElem)
10134 CommonMask[Idx] = Idx;
10135 assert(VF > 0 &&
10136 "Expected vector length for the final value before action.");
10137 Value *V = Vec.get<Value *>();
10138 Action(V, CommonMask);
10139 InVectors.front() = V;
10140 }
// Add insert_subvector-style costs for each (entry, insert index) pair,
// including any min-bitwidth cast needed to match ScalarTy.
10141 if (!SubVectors.empty()) {
10142 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10143 if (InVectors.size() == 2)
10144 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10145 else
10146 Cost += createShuffle(Vec, nullptr, CommonMask);
10147 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10148 if (CommonMask[Idx] != PoisonMaskElem)
10149 CommonMask[Idx] = Idx;
10150 for (auto [E, Idx] : SubVectors) {
10151 Type *EScalarTy = E->Scalars.front()->getType();
10152 bool IsSigned = true;
10153 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10154 EScalarTy =
10155 IntegerType::get(EScalarTy->getContext(), It->second.first);
10156 IsSigned = It->second.second;
10157 }
10158 if (ScalarTy != EScalarTy) {
10159 unsigned CastOpcode = Instruction::Trunc;
10160 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10161 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10162 if (DstSz > SrcSz)
10163 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
// NOTE(review): original lines 10164 and 10167 (the Cost += getCastInstrCost
// statement wrapping these arguments) are elided in this listing.
10165 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
10166 getWidenedType(EScalarTy, E->getVectorFactor()),
10168 }
10169 Cost += ::getShuffleCost(
// NOTE(review): original line 10170 (TTI and SK_InsertSubvector arguments)
// is elided in this listing.
10171 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
10172 getWidenedType(ScalarTy, E->getVectorFactor()));
10173 if (!CommonMask.empty()) {
10174 std::iota(std::next(CommonMask.begin(), Idx),
10175 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
10176 Idx);
10177 }
10178 }
10179 }
10180
// Extend the common mask by the caller-provided external mask and emit the
// final shuffle (if any mask remains).
10181 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
10182 if (CommonMask.empty()) {
10183 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
10184 return Cost;
10185 }
10186 return Cost +
10187 createShuffle(InVectors.front(),
10188 InVectors.size() == 2 ? InVectors.back() : nullptr,
10189 CommonMask);
10190 }
10191
// NOTE(review): the destructor header (original line 10192, presumably
// ~ShuffleCostEstimator()) is elided in this listing; kept verbatim.
// On destruction, verify every accumulated mask was flushed via finalize().
10193 assert((IsFinalized || CommonMask.empty()) &&
10194 "Shuffle construction must be finalized.");
10195 }
10196};
10197
10198const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
10199 unsigned Idx) const {
10200 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
10201 return VE;
10202 const auto *It =
10203 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
10204 return TE->isGather() &&
10205 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
10206 return EI.EdgeIdx == Idx && EI.UserTE == E;
10207 }) != TE->UserTreeIndices.end();
10208 });
10209 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
10210 return It->get();
10211}
10212
// Derives a TTI cast-context hint from how entry \p TE accesses memory.
// NOTE(review): the return statements (original lines 10216, 10220, 10224,
// 10226 - presumably GatherScatter / Normal / Reversed / None) are elided in
// this listing; kept verbatim. Confirm against upstream before relying on
// the exact hint values.
10213TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
// Masked/strided memory nodes.
10214 if (TE.State == TreeEntry::ScatterVectorize ||
10215 TE.State == TreeEntry::StridedVectorize)
// Plain vectorized loads: distinguish in-order from reversed access.
10217 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
10218 !TE.isAltShuffle()) {
10219 if (TE.ReorderIndices.empty())
10221 SmallVector<int> Mask;
10222 inversePermutation(TE.ReorderIndices, Mask);
10223 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
10225 }
10227}
10228
10229/// Builds the arguments types vector for the given call instruction with the
10230/// given \p ID for the specified vector factor.
// NOTE(review): the function signature line (original 10231, presumably
// static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI, ...)
// and the intrinsic scalar-operand check (original 10238) are elided in this
// listing; the brace structure below reflects those elisions. Kept verbatim.
10232 const Intrinsic::ID ID,
10233 const unsigned VF,
10234 unsigned MinBW) {
10235 SmallVector<Type *> ArgTys;
10236 for (auto [Idx, Arg] : enumerate(CI->args())) {
10237 if (ID != Intrinsic::not_intrinsic) {
// Scalar intrinsic operands keep their scalar type.
10239 ArgTys.push_back(Arg->getType());
10240 continue;
10241 }
// When a smaller bitwidth was computed, widen the shrunken integer type.
10242 if (MinBW > 0) {
10243 ArgTys.push_back(
10244 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
10245 continue;
10246 }
10247 }
// Default: widen the argument's own type by the vector factor.
10248 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
10249 }
10250 return ArgTys;
10251}
10252
10254BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
10255 SmallPtrSetImpl<Value *> &CheckedExtracts) {
10256 ArrayRef<Value *> VL = E->Scalars;
10257
10258 Type *ScalarTy = getValueType(VL[0]);
10259 if (!isValidElementType(ScalarTy))
10262
10263 // If we have computed a smaller type for the expression, update VecTy so
10264 // that the costs will be accurate.
10265 auto It = MinBWs.find(E);
10266 Type *OrigScalarTy = ScalarTy;
10267 if (It != MinBWs.end()) {
10268 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
10269 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
10270 if (VecTy)
10271 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
10272 }
10273 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10274 unsigned EntryVF = E->getVectorFactor();
10275 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
10276
10277 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
10278 if (E->isGather()) {
10279 if (allConstant(VL))
10280 return 0;
10281 if (isa<InsertElementInst>(VL[0]))
10283 if (isa<CmpInst>(VL.front()))
10284 ScalarTy = VL.front()->getType();
10285 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
10286 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
10287 }
10288 InstructionCost CommonCost = 0;
10290 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
10291 if (!E->ReorderIndices.empty() &&
10292 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
10293 SmallVector<int> NewMask;
10294 if (E->getOpcode() == Instruction::Store) {
10295 // For stores the order is actually a mask.
10296 NewMask.resize(E->ReorderIndices.size());
10297 copy(E->ReorderIndices, NewMask.begin());
10298 } else {
10299 inversePermutation(E->ReorderIndices, NewMask);
10300 }
10301 ::addMask(Mask, NewMask);
10302 }
10303 if (NeedToShuffleReuses)
10304 ::addMask(Mask, E->ReuseShuffleIndices);
10305 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
10306 CommonCost =
10307 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
10308 assert((E->State == TreeEntry::Vectorize ||
10309 E->State == TreeEntry::ScatterVectorize ||
10310 E->State == TreeEntry::StridedVectorize) &&
10311 "Unhandled state");
10312 assert(E->getOpcode() &&
10313 ((allSameType(VL) && allSameBlock(VL)) ||
10314 (E->getOpcode() == Instruction::GetElementPtr &&
10315 E->getMainOp()->getType()->isPointerTy())) &&
10316 "Invalid VL");
10317 Instruction *VL0 = E->getMainOp();
10318 unsigned ShuffleOrOp =
10319 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
10320 if (E->CombinedOp != TreeEntry::NotCombinedOp)
10321 ShuffleOrOp = E->CombinedOp;
10322 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
10323 const unsigned Sz = UniqueValues.size();
10324 SmallBitVector UsedScalars(Sz, false);
10325 for (unsigned I = 0; I < Sz; ++I) {
10326 if (getTreeEntry(UniqueValues[I]) == E)
10327 continue;
10328 UsedScalars.set(I);
10329 }
10330 auto GetCastContextHint = [&](Value *V) {
10331 if (const TreeEntry *OpTE = getTreeEntry(V))
10332 return getCastContextHint(*OpTE);
10333 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
10334 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
10337 };
10338 auto GetCostDiff =
10339 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
10341 // Calculate the cost of this instruction.
10342 InstructionCost ScalarCost = 0;
10343 if (isa<CastInst, CallInst>(VL0)) {
10344 // For some of the instructions no need to calculate cost for each
10345 // particular instruction, we can use the cost of the single
10346 // instruction x total number of scalar instructions.
10347 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
10348 } else {
10349 for (unsigned I = 0; I < Sz; ++I) {
10350 if (UsedScalars.test(I))
10351 continue;
10352 ScalarCost += ScalarEltCost(I);
10353 }
10354 }
10355
10356 InstructionCost VecCost = VectorCost(CommonCost);
10357 // Check if the current node must be resized, if the parent node is not
10358 // resized.
10359 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
10360 E->Idx != 0 &&
10361 (E->getOpcode() != Instruction::Load ||
10362 !E->UserTreeIndices.empty())) {
10363 const EdgeInfo &EI =
10364 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
10365 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
10366 });
10367 if (EI.UserTE->getOpcode() != Instruction::Select ||
10368 EI.EdgeIdx != 0) {
10369 auto UserBWIt = MinBWs.find(EI.UserTE);
10370 Type *UserScalarTy =
10371 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
10372 if (UserBWIt != MinBWs.end())
10373 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
10374 UserBWIt->second.first);
10375 if (ScalarTy != UserScalarTy) {
10376 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
10377 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
10378 unsigned VecOpcode;
10379 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
10380 if (BWSz > SrcBWSz)
10381 VecOpcode = Instruction::Trunc;
10382 else
10383 VecOpcode =
10384 It->second.second ? Instruction::SExt : Instruction::ZExt;
10385 TTI::CastContextHint CCH = GetCastContextHint(VL0);
10386 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
10387 CostKind);
10388 }
10389 }
10390 }
10391 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
10392 ScalarCost, "Calculated costs for Tree"));
10393 return VecCost - ScalarCost;
10394 };
10395 // Calculate cost difference from vectorizing set of GEPs.
10396 // Negative value means vectorizing is profitable.
10397 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
10398 assert((E->State == TreeEntry::Vectorize ||
10399 E->State == TreeEntry::StridedVectorize) &&
10400 "Entry state expected to be Vectorize or StridedVectorize here.");
10401 InstructionCost ScalarCost = 0;
10402 InstructionCost VecCost = 0;
10403 std::tie(ScalarCost, VecCost) = getGEPCosts(
10404 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
10405 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
10406 "Calculated GEPs cost for Tree"));
10407
10408 return VecCost - ScalarCost;
10409 };
10410
10411 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
10412 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
10413 if (MinMaxID == Intrinsic::not_intrinsic)
10415 Type *CanonicalType = Ty;
10416 if (CanonicalType->isPtrOrPtrVectorTy())
10417 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
10418 CanonicalType->getContext(),
10419 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
10420
10421 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
10422 {CanonicalType, CanonicalType});
10423 InstructionCost IntrinsicCost =
10424 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
10425 // If the selects are the only uses of the compares, they will be
10426 // dead and we can adjust the cost by removing their cost.
10427 if (VI && SelectOnly) {
10428 assert(!Ty->isVectorTy() && "Expected only for scalar type.");
10429 auto *CI = cast<CmpInst>(VI->getOperand(0));
10430 IntrinsicCost -= TTI->getCmpSelInstrCost(
10431 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
10432 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
10433 {TTI::OK_AnyValue, TTI::OP_None}, CI);
10434 }
10435 return IntrinsicCost;
10436 };
10437 switch (ShuffleOrOp) {
10438 case Instruction::PHI: {
10439 // Count reused scalars.
10440 InstructionCost ScalarCost = 0;
10442 for (Value *V : UniqueValues) {
10443 auto *PHI = dyn_cast<PHINode>(V);
10444 if (!PHI)
10445 continue;
10446
10447 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
10448 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
10449 Value *Op = PHI->getIncomingValue(I);
10450 Operands[I] = Op;
10451 }
10452 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
10453 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
10454 if (!OpTE->ReuseShuffleIndices.empty())
10455 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
10456 OpTE->Scalars.size());
10457 }
10458
10459 return CommonCost - ScalarCost;
10460 }
10461 case Instruction::ExtractValue:
10462 case Instruction::ExtractElement: {
10463 auto GetScalarCost = [&](unsigned Idx) {
10464 auto *I = cast<Instruction>(UniqueValues[Idx]);
10465 VectorType *SrcVecTy;
10466 if (ShuffleOrOp == Instruction::ExtractElement) {
10467 auto *EE = cast<ExtractElementInst>(I);
10468 SrcVecTy = EE->getVectorOperandType();
10469 } else {
10470 auto *EV = cast<ExtractValueInst>(I);
10471 Type *AggregateTy = EV->getAggregateOperand()->getType();
10472 unsigned NumElts;
10473 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
10474 NumElts = ATy->getNumElements();
10475 else
10476 NumElts = AggregateTy->getStructNumElements();
10477 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
10478 }
10479 if (I->hasOneUse()) {
10480 Instruction *Ext = I->user_back();
10481 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
10483 // Use getExtractWithExtendCost() to calculate the cost of
10484 // extractelement/ext pair.
10486 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
10487 // Subtract the cost of s|zext which is subtracted separately.
10488 Cost -= TTI->getCastInstrCost(
10489 Ext->getOpcode(), Ext->getType(), I->getType(),
10491 return Cost;
10492 }
10493 }
10494 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
10496 };
10497 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
10498 return GetCostDiff(GetScalarCost, GetVectorCost);
10499 }
10500 case Instruction::InsertElement: {
10501 assert(E->ReuseShuffleIndices.empty() &&
10502 "Unique insertelements only are expected.");
10503 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
10504 unsigned const NumElts = SrcVecTy->getNumElements();
10505 unsigned const NumScalars = VL.size();
10506
10507 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
10508
10509 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
10510 unsigned OffsetBeg = *getElementIndex(VL.front());
10511 unsigned OffsetEnd = OffsetBeg;
10512 InsertMask[OffsetBeg] = 0;
10513 for (auto [I, V] : enumerate(VL.drop_front())) {
10514 unsigned Idx = *getElementIndex(V);
10515 if (OffsetBeg > Idx)
10516 OffsetBeg = Idx;
10517 else if (OffsetEnd < Idx)
10518 OffsetEnd = Idx;
10519 InsertMask[Idx] = I + 1;
10520 }
10521 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
10522 if (NumOfParts > 0 && NumOfParts < NumElts)
10523 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
10524 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
10525 VecScalarsSz;
10526 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
10527 unsigned InsertVecSz = std::min<unsigned>(
10528 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
10529 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
10530 bool IsWholeSubvector =
10531 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
10532 // Check if we can safely insert a subvector. If it is not possible, just
10533 // generate a whole-sized vector and shuffle the source vector and the new
10534 // subvector.
10535 if (OffsetBeg + InsertVecSz > VecSz) {
10536 // Align OffsetBeg to generate correct mask.
10537 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
10538 InsertVecSz = VecSz;
10539 }
10540
10541 APInt DemandedElts = APInt::getZero(NumElts);
10542 // TODO: Add support for Instruction::InsertValue.
10544 if (!E->ReorderIndices.empty()) {
10545 inversePermutation(E->ReorderIndices, Mask);
10546 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
10547 } else {
10548 Mask.assign(VecSz, PoisonMaskElem);
10549 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
10550 }
10551 bool IsIdentity = true;
10552 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
10553 Mask.swap(PrevMask);
10554 for (unsigned I = 0; I < NumScalars; ++I) {
10555 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
10556 DemandedElts.setBit(InsertIdx);
10557 IsIdentity &= InsertIdx - OffsetBeg == I;
10558 Mask[InsertIdx - OffsetBeg] = I;
10559 }
10560 assert(Offset < NumElts && "Failed to find vector index offset");
10561
10562 InstructionCost Cost = 0;
10563 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
10564 /*Insert*/ true, /*Extract*/ false,
10565 CostKind);
10566
10567 // First cost - resize to actual vector size if not identity shuffle or
10568 // need to shift the vector.
10569 // Do not calculate the cost if the actual size is the register size and
10570 // we can merge this shuffle with the following SK_Select.
10571 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
10572 if (!IsIdentity)
10574 InsertVecTy, Mask);
10575 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
10576 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
10577 }));
10578 // Second cost - permutation with subvector, if some elements are from the
10579 // initial vector or inserting a subvector.
10580 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
10581 // subvector of ActualVecTy.
10582 SmallBitVector InMask =
10583 isUndefVector(FirstInsert->getOperand(0),
10584 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
10585 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
10586 if (InsertVecSz != VecSz) {
10587 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
10588 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
10589 CostKind, OffsetBeg - Offset, InsertVecTy);
10590 } else {
10591 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
10592 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
10593 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
10594 I <= End; ++I)
10595 if (Mask[I] != PoisonMaskElem)
10596 Mask[I] = I + VecSz;
10597 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
10598 Mask[I] =
10599 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
10600 Cost +=
10601 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
10602 }
10603 }
10604 return Cost;
10605 }
10606 case Instruction::ZExt:
10607 case Instruction::SExt:
10608 case Instruction::FPToUI:
10609 case Instruction::FPToSI:
10610 case Instruction::FPExt:
10611 case Instruction::PtrToInt:
10612 case Instruction::IntToPtr:
10613 case Instruction::SIToFP:
10614 case Instruction::UIToFP:
10615 case Instruction::Trunc:
10616 case Instruction::FPTrunc:
10617 case Instruction::BitCast: {
10618 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10619 Type *SrcScalarTy = VL0->getOperand(0)->getType();
10620 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
10621 unsigned Opcode = ShuffleOrOp;
10622 unsigned VecOpcode = Opcode;
10623 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
10624 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
10625 // Check if the values are candidates to demote.
10626 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
10627 if (SrcIt != MinBWs.end()) {
10628 SrcBWSz = SrcIt->second.first;
10629 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
10630 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
10631 SrcVecTy =
10632 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
10633 }
10634 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
10635 if (BWSz == SrcBWSz) {
10636 VecOpcode = Instruction::BitCast;
10637 } else if (BWSz < SrcBWSz) {
10638 VecOpcode = Instruction::Trunc;
10639 } else if (It != MinBWs.end()) {
10640 assert(BWSz > SrcBWSz && "Invalid cast!");
10641 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10642 } else if (SrcIt != MinBWs.end()) {
10643 assert(BWSz > SrcBWSz && "Invalid cast!");
10644 VecOpcode =
10645 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
10646 }
10647 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
10648 !SrcIt->second.second) {
10649 VecOpcode = Instruction::UIToFP;
10650 }
10651 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
10652 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10653 return TTI->getCastInstrCost(Opcode, VL0->getType(),
10654 VL0->getOperand(0)->getType(),
10656 };
10657 auto GetVectorCost = [=](InstructionCost CommonCost) {
10658 // Do not count cost here if minimum bitwidth is in effect and it is just
10659 // a bitcast (here it is just a noop).
10660 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
10661 return CommonCost;
10662 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
10663 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
10664 return CommonCost +
10665 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
10666 VecOpcode == Opcode ? VI : nullptr);
10667 };
10668 return GetCostDiff(GetScalarCost, GetVectorCost);
10669 }
10670 case Instruction::FCmp:
10671 case Instruction::ICmp:
10672 case Instruction::Select: {
10673 CmpInst::Predicate VecPred, SwappedVecPred;
10674 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
10675 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
10676 match(VL0, MatchCmp))
10677 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
10678 else
10679 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
10682 auto GetScalarCost = [&](unsigned Idx) {
10683 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10684 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
10687 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
10688 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
10689 !match(VI, MatchCmp)) ||
10690 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
10691 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
10694
10696 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
10697 CostKind, getOperandInfo(VI->getOperand(0)),
10698 getOperandInfo(VI->getOperand(1)), VI);
10699 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
10700 if (IntrinsicCost.isValid())
10701 ScalarCost = IntrinsicCost;
10702
10703 return ScalarCost;
10704 };
10705 auto GetVectorCost = [&](InstructionCost CommonCost) {
10706 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
10707
10708 InstructionCost VecCost =
10709 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
10710 CostKind, getOperandInfo(E->getOperand(0)),
10711 getOperandInfo(E->getOperand(1)), VL0);
10712 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
10713 auto *CondType =
10714 getWidenedType(SI->getCondition()->getType(), VL.size());
10715 unsigned CondNumElements = CondType->getNumElements();
10716 unsigned VecTyNumElements = getNumElements(VecTy);
10717 assert(VecTyNumElements >= CondNumElements &&
10718 VecTyNumElements % CondNumElements == 0 &&
10719 "Cannot vectorize Instruction::Select");
10720 if (CondNumElements != VecTyNumElements) {
10721 // When the return type is i1 but the source is fixed vector type, we
10722 // need to duplicate the condition value.
10723 VecCost += ::getShuffleCost(
10724 *TTI, TTI::SK_PermuteSingleSrc, CondType,
10725 createReplicatedMask(VecTyNumElements / CondNumElements,
10726 CondNumElements));
10727 }
10728 }
10729 return VecCost + CommonCost;
10730 };
10731 return GetCostDiff(GetScalarCost, GetVectorCost);
10732 }
10733 case TreeEntry::MinMax: {
10734 auto GetScalarCost = [&](unsigned Idx) {
10735 return GetMinMaxCost(OrigScalarTy);
10736 };
10737 auto GetVectorCost = [&](InstructionCost CommonCost) {
10738 InstructionCost VecCost = GetMinMaxCost(VecTy);
10739 return VecCost + CommonCost;
10740 };
10741 return GetCostDiff(GetScalarCost, GetVectorCost);
10742 }
10743 case Instruction::FNeg:
10744 case Instruction::Add:
10745 case Instruction::FAdd:
10746 case Instruction::Sub:
10747 case Instruction::FSub:
10748 case Instruction::Mul:
10749 case Instruction::FMul:
10750 case Instruction::UDiv:
10751 case Instruction::SDiv:
10752 case Instruction::FDiv:
10753 case Instruction::URem:
10754 case Instruction::SRem:
10755 case Instruction::FRem:
10756 case Instruction::Shl:
10757 case Instruction::LShr:
10758 case Instruction::AShr:
10759 case Instruction::And:
10760 case Instruction::Or:
10761 case Instruction::Xor: {
10762 auto GetScalarCost = [&](unsigned Idx) {
10763 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10764 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
10765 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
10766 TTI::OperandValueInfo Op2Info =
10767 TTI::getOperandInfo(VI->getOperand(OpIdx));
10768 SmallVector<const Value *> Operands(VI->operand_values());
10769 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
10770 Op1Info, Op2Info, Operands, VI);
10771 };
10772 auto GetVectorCost = [=](InstructionCost CommonCost) {
10773 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
10774 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
10775 ArrayRef<Value *> Ops = E->getOperand(I);
10776 if (all_of(Ops, [&](Value *Op) {
10777 auto *CI = dyn_cast<ConstantInt>(Op);
10778 return CI && CI->getValue().countr_one() >= It->second.first;
10779 }))
10780 return CommonCost;
10781 }
10782 }
10783 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
10784 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
10785 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
10786 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
10787 Op2Info, {}, nullptr, TLI) +
10788 CommonCost;
10789 };
10790 return GetCostDiff(GetScalarCost, GetVectorCost);
10791 }
10792 case Instruction::GetElementPtr: {
10793 return CommonCost + GetGEPCostDiff(VL, VL0);
10794 }
10795 case Instruction::Load: {
10796 auto GetScalarCost = [&](unsigned Idx) {
10797 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
10798 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
10799 VI->getAlign(), VI->getPointerAddressSpace(),
10801 };
10802 auto *LI0 = cast<LoadInst>(VL0);
10803 auto GetVectorCost = [&](InstructionCost CommonCost) {
10804 InstructionCost VecLdCost;
10805 if (E->State == TreeEntry::Vectorize) {
10806 VecLdCost = TTI->getMemoryOpCost(
10807 Instruction::Load, VecTy, LI0->getAlign(),
10808 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
10809 } else if (E->State == TreeEntry::StridedVectorize) {
10810 Align CommonAlignment =
10811 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10812 VecLdCost = TTI->getStridedMemoryOpCost(
10813 Instruction::Load, VecTy, LI0->getPointerOperand(),
10814 /*VariableMask=*/false, CommonAlignment, CostKind);
10815 } else {
10816 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
10817 Align CommonAlignment =
10818 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
10819 VecLdCost = TTI->getGatherScatterOpCost(
10820 Instruction::Load, VecTy, LI0->getPointerOperand(),
10821 /*VariableMask=*/false, CommonAlignment, CostKind);
10822 }
10823 return VecLdCost + CommonCost;
10824 };
10825
10826 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
10827 // If this node generates masked gather load then it is not a terminal node.
10828 // Hence address operand cost is estimated separately.
10829 if (E->State == TreeEntry::ScatterVectorize)
10830 return Cost;
10831
10832 // Estimate cost of GEPs since this tree node is a terminator.
10833 SmallVector<Value *> PointerOps(VL.size());
10834 for (auto [I, V] : enumerate(VL))
10835 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
10836 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
10837 }
10838 case Instruction::Store: {
10839 bool IsReorder = !E->ReorderIndices.empty();
10840 auto GetScalarCost = [=](unsigned Idx) {
10841 auto *VI = cast<StoreInst>(VL[Idx]);
10842 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
10843 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
10844 VI->getAlign(), VI->getPointerAddressSpace(),
10845 CostKind, OpInfo, VI);
10846 };
10847 auto *BaseSI =
10848 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
10849 auto GetVectorCost = [=](InstructionCost CommonCost) {
10850 // We know that we can merge the stores. Calculate the cost.
10851 InstructionCost VecStCost;
10852 if (E->State == TreeEntry::StridedVectorize) {
10853 Align CommonAlignment =
10854 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
10855 VecStCost = TTI->getStridedMemoryOpCost(
10856 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10857 /*VariableMask=*/false, CommonAlignment, CostKind);
10858 } else {
10859 assert(E->State == TreeEntry::Vectorize &&
10860 "Expected either strided or consecutive stores.");
10861 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
10862 VecStCost = TTI->getMemoryOpCost(
10863 Instruction::Store, VecTy, BaseSI->getAlign(),
10864 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
10865 }
10866 return VecStCost + CommonCost;
10867 };
10868 SmallVector<Value *> PointerOps(VL.size());
10869 for (auto [I, V] : enumerate(VL)) {
10870 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
10871 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
10872 }
10873
10874 return GetCostDiff(GetScalarCost, GetVectorCost) +
10875 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
10876 }
10877 case Instruction::Call: {
10878 auto GetScalarCost = [&](unsigned Idx) {
10879 auto *CI = cast<CallInst>(UniqueValues[Idx]);
10881 if (ID != Intrinsic::not_intrinsic) {
10882 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
10883 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
10884 }
10887 CI->getFunctionType()->params(), CostKind);
10888 };
10889 auto GetVectorCost = [=](InstructionCost CommonCost) {
10890 auto *CI = cast<CallInst>(VL0);
10892 SmallVector<Type *> ArgTys =
10893 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
10894 It != MinBWs.end() ? It->second.first : 0);
10895 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10896 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
10897 };
10898 return GetCostDiff(GetScalarCost, GetVectorCost);
10899 }
10900 case Instruction::ShuffleVector: {
10901 if (!SLPReVec || E->isAltShuffle())
10902 assert(E->isAltShuffle() &&
10903 ((Instruction::isBinaryOp(E->getOpcode()) &&
10904 Instruction::isBinaryOp(E->getAltOpcode())) ||
10905 (Instruction::isCast(E->getOpcode()) &&
10906 Instruction::isCast(E->getAltOpcode())) ||
10907 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
10908 "Invalid Shuffle Vector Operand");
10909 // Try to find the previous shuffle node with the same operands and same
10910 // main/alternate ops.
10911 auto TryFindNodeWithEqualOperands = [=]() {
10912 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10913 if (TE.get() == E)
10914 break;
10915 if (TE->isAltShuffle() &&
10916 ((TE->getOpcode() == E->getOpcode() &&
10917 TE->getAltOpcode() == E->getAltOpcode()) ||
10918 (TE->getOpcode() == E->getAltOpcode() &&
10919 TE->getAltOpcode() == E->getOpcode())) &&
10920 TE->hasEqualOperands(*E))
10921 return true;
10922 }
10923 return false;
10924 };
10925 auto GetScalarCost = [&](unsigned Idx) {
10926 auto *VI = cast<Instruction>(UniqueValues[Idx]);
10927 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
10928 (void)E;
10929 return TTI->getInstructionCost(VI, CostKind);
10930 };
10931 // Need to clear CommonCost since the final shuffle cost is included into
10932 // vector cost.
10933 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
10934 // VecCost is equal to sum of the cost of creating 2 vectors
10935 // and the cost of creating shuffle.
10936 InstructionCost VecCost = 0;
10937 if (TryFindNodeWithEqualOperands()) {
10938 LLVM_DEBUG({
10939 dbgs() << "SLP: diamond match for alternate node found.\n";
10940 E->dump();
10941 });
10942 // No need to add new vector costs here since we're going to reuse
10943 // same main/alternate vector ops, just do different shuffling.
10944 } else if (Instruction::isBinaryOp(E->getOpcode())) {
10945 VecCost =
10946 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
10947 VecCost +=
10948 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
10949 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
10950 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
10951 VecCost = TTIRef.getCmpSelInstrCost(
10952 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
10953 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
10954 VL0);
10955 VecCost += TTIRef.getCmpSelInstrCost(
10956 E->getOpcode(), VecTy, MaskTy,
10957 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
10958 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
10959 E->getAltOp());
10960 } else {
10961 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
10962 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
10963 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
10964 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
10965 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
10966 unsigned SrcBWSz =
10967 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
10968 if (SrcIt != MinBWs.end()) {
10969 SrcBWSz = SrcIt->second.first;
10970 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
10971 SrcTy = getWidenedType(SrcSclTy, VL.size());
10972 }
10973 if (BWSz <= SrcBWSz) {
10974 if (BWSz < SrcBWSz)
10975 VecCost =
10976 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
10978 LLVM_DEBUG({
10979 dbgs()
10980 << "SLP: alternate extension, which should be truncated.\n";
10981 E->dump();
10982 });
10983 return VecCost;
10984 }
10985 }
10986 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
10988 VecCost +=
10989 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
10991 }
10993 E->buildAltOpShuffleMask(
10994 [&](Instruction *I) {
10995 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
10996 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
10997 *TLI);
10998 },
10999 Mask);
11001 FinalVecTy, Mask, CostKind);
11002 // Patterns like [fadd,fsub] can be combined into a single instruction
11003 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11004 // need to take into account their order when looking for the most used
11005 // order.
11006 unsigned Opcode0 = E->getOpcode();
11007 unsigned Opcode1 = E->getAltOpcode();
11008 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11009 // If this pattern is supported by the target then we consider the
11010 // order.
11011 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11012 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11013 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11014 return AltVecCost < VecCost ? AltVecCost : VecCost;
11015 }
11016 // TODO: Check the reverse order too.
11017 return VecCost;
11018 };
11019 if (SLPReVec && !E->isAltShuffle())
11020 return GetCostDiff(
11021 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11022 // If a group uses mask in order, the shufflevector can be
11023 // eliminated by instcombine. Then the cost is 0.
11025 "Not supported shufflevector usage.");
11026 auto *SV = cast<ShuffleVectorInst>(VL.front());
11027 unsigned SVNumElements =
11028 cast<FixedVectorType>(SV->getOperand(0)->getType())
11029 ->getNumElements();
11030 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11031 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11032 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11033 int NextIndex = 0;
11034 if (!all_of(Group, [&](Value *V) {
11036 "Not supported shufflevector usage.");
11037 auto *SV = cast<ShuffleVectorInst>(V);
11038 int Index;
11039 [[maybe_unused]] bool isExtractSubvectorMask =
11040 SV->isExtractSubvectorMask(Index);
11041 assert(isExtractSubvectorMask &&
11042 "Not supported shufflevector usage.");
11043 if (NextIndex != Index)
11044 return false;
11045 NextIndex += SV->getShuffleMask().size();
11046 return true;
11047 }))
11048 return ::getShuffleCost(
11050 calculateShufflevectorMask(E->Scalars));
11051 }
11052 return TTI::TCC_Free;
11053 });
11054 return GetCostDiff(GetScalarCost, GetVectorCost);
11055 }
11056 case Instruction::Freeze:
11057 return CommonCost;
11058 default:
11059 llvm_unreachable("Unknown instruction");
11060 }
11061}
11062
// Returns true if the whole vectorizable tree is "tiny" (height 1 or 2) and
// every node in it is still considered worth vectorizing, so the tree can be
// accepted without the full cost-model gating applied to larger trees.
// \param ForReduction relaxes the height-1 case: a lone vectorizable gather
//        node is acceptable when the tree feeds a horizontal reduction.
// NOTE(review): this excerpt elides a few original source lines (e.g. the
// declaration of the 'Mask' vector used inside the lambda below, original
// line 11068) -- the comments here describe only the visible code.
11063bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11064 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11065 << VectorizableTree.size() << " is fully vectorizable .\n");
11066
// A gather node is tolerable in a tiny tree only when none of its scalars
// are ephemeral values and it is cheap to materialize: all-constant, a
// splat, narrower than Limit, an extractelement bundle forming a fixed
// vector shuffle, a uniform (non-alt-shuffle) load bundle, or at least
// containing some loads.
11067 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11069 return TE->isGather() &&
11070 !any_of(TE->Scalars,
11071 [this](Value *V) { return EphValues.contains(V); }) &&
11072 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11073 TE->Scalars.size() < Limit ||
11074 ((TE->getOpcode() == Instruction::ExtractElement ||
11076 isFixedVectorShuffle(TE->Scalars, Mask)) ||
11077 (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) ||
11078 any_of(TE->Scalars, IsaPred<LoadInst>));
11079 };
11080
11081 // We only handle trees of heights 1 and 2.
// Height-1 tree: accept if the single node is truly vectorized (consecutive
// or strided), or -- for reductions only -- a vectorizable gather wider than
// two lanes.
11082 if (VectorizableTree.size() == 1 &&
11083 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11084 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11085 (ForReduction &&
11086 AreVectorizableGathers(VectorizableTree[0].get(),
11087 VectorizableTree[0]->Scalars.size()) &&
11088 VectorizableTree[0]->getVectorFactor() > 2)))
11089 return true;
11090
// Anything other than exactly two nodes is not a handled tiny tree.
11091 if (VectorizableTree.size() != 2)
11092 return false;
11093
11094 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11095 // with the second gather nodes if they have less scalar operands rather than
11096 // the initial tree element (may be profitable to shuffle the second gather)
11097 // or they are extractelements, which form shuffle.
// Root is vectorized and the operand node is a cheap gather (bounded by the
// root's width): accept.
// NOTE(review): one original line (11098) is elided just above this check.
11099 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11100 AreVectorizableGathers(VectorizableTree[1].get(),
11101 VectorizableTree[0]->Scalars.size()))
11102 return true;
11103
11104 // Gathering cost would be too much for tiny trees.
// Reject when the root itself gathers, or when the operand gathers while the
// root is a plain consecutive vectorize (scatter/strided roots can still
// absorb a gathered operand).
11105 if (VectorizableTree[0]->isGather() ||
11106 (VectorizableTree[1]->isGather() &&
11107 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
11108 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
11109 return false;
11110
11111 return true;
11112}
11113
// Returns true if the scalar expression rooted at \p Root looks like a
// "load combine": a chain of or/shl-by-multiple-of-8 over zext(load) that
// the backend can fold into one wide load, in which case SLP vectorization
// would only get in the way.
// NOTE(review): listing line 11115 (presumably the TargetTransformInfo
// parameter) is absent from this extraction; visible tokens kept verbatim.
11114static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
11116                                       bool MustMatchOrInst) {
11117  // Look past the root to find a source value. Arbitrarily follow the
11118  // path through operand 0 of any 'or'. Also, peek through optional
11119  // shift-left-by-multiple-of-8-bits.
11120  Value *ZextLoad = Root;
11121  const APInt *ShAmtC;
11122  bool FoundOr = false;
  // Walk down through or/shl operators; remember whether any 'or' was seen
  // so MustMatchOrInst can be enforced below.
11123  while (!isa<ConstantExpr>(ZextLoad) &&
11124         (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
11125          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
11126           ShAmtC->urem(8) == 0))) {
11127    auto *BinOp = cast<BinaryOperator>(ZextLoad);
11128    ZextLoad = BinOp->getOperand(0);
11129    if (BinOp->getOpcode() == Instruction::Or)
11130      FoundOr = true;
11131  }
11132  // Check if the input is an extended load of the required or/shift expression.
11133  Value *Load;
11134  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
11135      !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
11136    return false;
11137
11138  // Require that the total load bit width is a legal integer type.
11139  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
11140  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
11141  Type *SrcTy = Load->getType();
11142  unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
11143  if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
11144    return false;
11145
11146  // Everything matched - assume that we can fold the whole sequence using
11147  // load combining.
11148  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
11149                    << *(cast<Instruction>(Root)) << "\n");
11150
11151  return true;
11152}
11153
// NOTE(review): the signature line (listing line 11154, presumably
// BoUpSLP::isLoadCombineReductionCandidate) is absent from this extraction;
// the visible body below is unchanged.
// Bail out of SLP for Or-reductions whose leaves will likely be merged into
// a single wide load by the backend (load combining). Only the first scalar
// of the root bundle is checked — presumably representative of all lanes;
// TODO confirm against full source.
11155  if (RdxKind != RecurKind::Or)
11156    return false;
11157
11158  unsigned NumElts = VectorizableTree[0]->Scalars.size();
11159  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
11160  return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
11161                                    /* MatchOr */ false);
11162}
11163
// NOTE(review): the signature line (listing line 11164, presumably
// BoUpSLP::isLoadCombineCandidate taking the Stores list) is absent from
// this extraction; the visible body below is unchanged.
11165  // Peek through a final sequence of stores and check if all operations are
11166  // likely to be load-combined.
11167  unsigned NumElts = Stores.size();
11168  for (Value *Scalar : Stores) {
11169    Value *X;
    // Every store must write a value matching the load-combine or/shift
    // pattern; MustMatchOrInst=true additionally requires an 'or' in it.
11170    if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
11171        !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
11172      return false;
11173  }
11174  return true;
11175}
11176
11177bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
11178 // No need to vectorize inserts of gathered values.
11179 if (VectorizableTree.size() == 2 &&
11180 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
11181 VectorizableTree[1]->isGather() &&
11182 (VectorizableTree[1]->getVectorFactor() <= 2 ||
11183 !(isSplat(VectorizableTree[1]->Scalars) ||
11184 allConstant(VectorizableTree[1]->Scalars))))
11185 return true;
11186
11187 // If the graph includes only PHI nodes and gathers, it is defnitely not
11188 // profitable for the vectorization, we can skip it, if the cost threshold is
11189 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
11190 // gathers/buildvectors.
11191 constexpr int Limit = 4;
11192 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
11193 !VectorizableTree.empty() &&
11194 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11195 return (TE->isGather() &&
11196 TE->getOpcode() != Instruction::ExtractElement &&
11197 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
11198 TE->getOpcode() == Instruction::PHI;
11199 }))
11200 return true;
11201
11202 // We can vectorize the tree if its size is greater than or equal to the
11203 // minimum size specified by the MinTreeSize command line option.
11204 if (VectorizableTree.size() >= MinTreeSize)
11205 return false;
11206
11207 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
11208 // can vectorize it if we can prove it fully vectorizable.
11209 if (isFullyVectorizableTinyTree(ForReduction))
11210 return false;
11211
11212 // Check if any of the gather node forms an insertelement buildvector
11213 // somewhere.
11214 bool IsAllowedSingleBVNode =
11215 VectorizableTree.size() > 1 ||
11216 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
11217 !VectorizableTree.front()->isAltShuffle() &&
11218 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
11219 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
11220 allSameBlock(VectorizableTree.front()->Scalars));
11221 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11222 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
11223 return isa<ExtractElementInst, UndefValue>(V) ||
11224 (IsAllowedSingleBVNode &&
11225 !V->hasNUsesOrMore(UsesLimit) &&
11226 any_of(V->users(), IsaPred<InsertElementInst>));
11227 });
11228 }))
11229 return false;
11230
11231 assert(VectorizableTree.empty()
11232 ? ExternalUses.empty()
11233 : true && "We shouldn't have any external users");
11234
11235 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
11236 // vectorizable.
11237 return true;
11238}
11239
// NOTE(review): this extraction drops several listing lines (11240 — the
// getSpillCost signature, 11246, 11248, 11315, 11323-11324, 11341 — local
// declarations); visible tokens are kept verbatim.
11241  // Walk from the bottom of the tree to the top, tracking which values are
11242  // live. When we see a call instruction that is not part of our tree,
11243  // query TTI to see if there is a cost to keeping values live over it
11244  // (for example, if spills and fills are required).
11245  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
11247
11249  Instruction *PrevInst = nullptr;
11250
11251  // The entries in VectorizableTree are not necessarily ordered by their
11252  // position in basic blocks. Collect them and order them by dominance so later
11253  // instructions are guaranteed to be visited first. For instructions in
11254  // different basic blocks, we only scan to the beginning of the block, so
11255  // their order does not matter, as long as all instructions in a basic block
11256  // are grouped together. Using dominance ensures a deterministic order.
11257  SmallVector<Instruction *, 16> OrderedScalars;
11258  for (const auto &TEPtr : VectorizableTree) {
11259    if (TEPtr->State != TreeEntry::Vectorize)
11260      continue;
11261    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
11262    if (!Inst)
11263      continue;
11264    OrderedScalars.push_back(Inst);
11265  }
  // Sort so dominated (later) blocks come first, and within a block later
  // instructions come first — the walk below proceeds bottom-up.
11266  llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
11267    auto *NodeA = DT->getNode(A->getParent());
11268    auto *NodeB = DT->getNode(B->getParent());
11269    assert(NodeA && "Should only process reachable instructions");
11270    assert(NodeB && "Should only process reachable instructions");
11271    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11272           "Different nodes should have different DFS numbers");
11273    if (NodeA != NodeB)
11274      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
11275    return B->comesBefore(A);
11276  });
11277
11278  for (Instruction *Inst : OrderedScalars) {
11279    if (!PrevInst) {
11280      PrevInst = Inst;
11281      continue;
11282    }
11283
11284    // Update LiveValues.
11285    LiveValues.erase(PrevInst);
11286    for (auto &J : PrevInst->operands()) {
11287      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
11288        LiveValues.insert(cast<Instruction>(&*J));
11289    }
11290
11291    LLVM_DEBUG({
11292      dbgs() << "SLP: #LV: " << LiveValues.size();
11293      for (auto *X : LiveValues)
11294        dbgs() << " " << X->getName();
11295      dbgs() << ", Looking at ";
11296      Inst->dump();
11297    });
11298
11299    // Now find the sequence of instructions between PrevInst and Inst.
11300    unsigned NumCalls = 0;
11301    BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
11302                                 PrevInstIt =
11303                                     PrevInst->getIterator().getReverse();
11304    while (InstIt != PrevInstIt) {
      // Ran off the top of PrevInst's block: continue from the bottom of
      // Inst's block (inter-block ordering is irrelevant here).
11305      if (PrevInstIt == PrevInst->getParent()->rend()) {
11306        PrevInstIt = Inst->getParent()->rbegin();
11307        continue;
11308      }
11309
      // Intrinsics that are assume-like, or that TTI says are cheaper than
      // an actual libcall, do not force spills — treat them as non-calls.
11310      auto NoCallIntrinsic = [this](Instruction *I) {
11311        if (auto *II = dyn_cast<IntrinsicInst>(I)) {
11312          if (II->isAssumeLikeIntrinsic())
11313            return true;
11314          FastMathFlags FMF;
11316          for (auto &ArgOp : II->args())
11317            Tys.push_back(ArgOp->getType());
11318          if (auto *FPMO = dyn_cast<FPMathOperator>(II))
11319            FMF = FPMO->getFastMathFlags();
11320          IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
11321                                      FMF);
11322          InstructionCost IntrCost =
11325              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
11326          if (IntrCost < CallCost)
11327            return true;
11328        }
11329        return false;
11330      };
11331
11332      // Debug information does not impact spill cost.
11333      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
11334          &*PrevInstIt != PrevInst)
11335        NumCalls++;
11336
11337      ++PrevInstIt;
11338    }
11339
    // Charge the target's cost of keeping each live value (widened to the
    // bundle width) across every intervening call.
11340    if (NumCalls) {
11342      for (auto *II : LiveValues) {
11343        auto *ScalarTy = II->getType();
11344        if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
11345          ScalarTy = VectorTy->getElementType();
11346        V.push_back(getWidenedType(ScalarTy, BundleWidth));
11347      }
11348      Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
11349    }
11350
11351    PrevInst = Inst;
11352  }
11353
11354  return Cost;
11355}
11356
11357/// Checks if the \p IE1 instructions is followed by \p IE2 instruction in the
11358/// buildvector sequence.
// NOTE(review): the first signature line (listing line 11359, carrying the
// function name and the IE1 parameter) is absent from this extraction.
11360                                 const InsertElementInst *IE2) {
11361  if (IE1 == IE2)
11362    return false;
11363  const auto *I1 = IE1;
11364  const auto *I2 = IE2;
11365  const InsertElementInst *PrevI1;
11366  const InsertElementInst *PrevI2;
11367  unsigned Idx1 = *getElementIndex(IE1);
11368  unsigned Idx2 = *getElementIndex(IE2);
  // Walk both insertelement chains towards the common start of the
  // buildvector via operand 0. Whichever walk reaches the other's starting
  // instruction first determines the order.
11369  do {
11370    if (I2 == IE1)
11371      return true;
11372    if (I1 == IE2)
11373      return false;
11374    PrevI1 = I1;
11375    PrevI2 = I2;
    // Step only past single-use inserts, and never past an insert into the
    // other walk's lane.
11376    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
11377        getElementIndex(I1).value_or(Idx2) != Idx2)
11378      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
11379    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
11380        getElementIndex(I2).value_or(Idx1) != Idx1)
11381      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
    // Stop when neither walk makes progress; that would mean two unrelated
    // buildvectors, which callers never pass in.
11382  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
11383  llvm_unreachable("Two different buildvectors not expected.");
11384}
11385
11386namespace {
11387/// Returns incoming Value *, if the requested type is Value * too, or a default
11388/// value, otherwise.
11389struct ValueSelect {
11390 template <typename U>
11391 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
11392 return V;
11393 }
11394 template <typename U>
11395 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
11396 return U();
11397 }
11398};
11399} // namespace
11400
11401/// Does the analysis of the provided shuffle masks and performs the requested
11402/// actions on the vectors with the given shuffle masks. It tries to do it in
11403/// several steps.
11404/// 1. If the Base vector is not undef vector, resizing the very first mask to
11405/// have common VF and perform action for 2 input vectors (including non-undef
11406/// Base). Other shuffle masks are combined with the resulting after the 1 stage
11407/// and processed as a shuffle of 2 elements.
11408/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
11409/// action only for 1 vector with the given mask, if it is not the identity
11410/// mask.
11411/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
11412/// vectors, combing the masks properly between the steps.
// NOTE(review): listing lines 11414 (function name/return type) and 11418
// (the Action function_ref parameter) are absent from this extraction; the
// visible parameter lines and body are unchanged.
11413template <typename T>
11415    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
11416    function_ref<unsigned(T *)> GetVF,
11417    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
11419  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
11420  SmallVector<int> Mask(ShuffleMask.begin()->second);
11421  auto VMIt = std::next(ShuffleMask.begin());
11422  T *Prev = nullptr;
11423  SmallBitVector UseMask =
11424      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
11425  SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
11426  if (!IsBaseUndef.all()) {
11427    // Base is not undef, need to combine it with the next subvectors.
    // Lanes not written by the first mask come from Base (index Idx);
    // written lanes are remapped into the second input (offset by VF).
11428    std::pair<T *, bool> Res =
11429        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
11430    SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
11431    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
11432      if (Mask[Idx] == PoisonMaskElem)
11433        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
11434      else
11435        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
11436    }
11437    auto *V = ValueSelect::get<T *>(Base);
11438    (void)V;
11439    assert((!V || GetVF(V) == Mask.size()) &&
11440           "Expected base vector of VF number of elements.");
11441    Prev = Action(Mask, {nullptr, Res.first});
11442  } else if (ShuffleMask.size() == 1) {
11443    // Base is undef and only 1 vector is shuffled - perform the action only for
11444    // single vector, if the mask is not the identity mask.
11445    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
11446                                            /*ForSingleMask=*/true);
11447    if (Res.second)
11448      // Identity mask is found.
11449      Prev = Res.first;
11450    else
11451      Prev = Action(Mask, {ShuffleMask.begin()->first});
11452  } else {
11453    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
11454    // shuffles step by step, combining shuffle between the steps.
11455    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
11456    unsigned Vec2VF = GetVF(VMIt->first);
11457    if (Vec1VF == Vec2VF) {
11458      // No need to resize the input vectors since they are of the same size, we
11459      // can shuffle them directly.
11460      ArrayRef<int> SecMask = VMIt->second;
11461      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
11462        if (SecMask[I] != PoisonMaskElem) {
11463          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
11464          Mask[I] = SecMask[I] + Vec1VF;
11465        }
11466      }
11467      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
11468    } else {
11469      // Vectors of different sizes - resize and reshuffle.
11470      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
11471                                               /*ForSingleMask=*/false);
11472      std::pair<T *, bool> Res2 =
11473          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
11474      ArrayRef<int> SecMask = VMIt->second;
11475      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
11476        if (Mask[I] != PoisonMaskElem) {
11477          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
11478          if (Res1.second)
11479            Mask[I] = I;
11480        } else if (SecMask[I] != PoisonMaskElem) {
11481          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
11482          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
11483        }
11484      }
11485      Prev = Action(Mask, {Res1.first, Res2.first});
11486    }
11487    VMIt = std::next(VMIt);
11488  }
11489  bool IsBaseNotUndef = !IsBaseUndef.all();
11490  (void)IsBaseNotUndef;
11491  // Perform requested actions for the remaining masks/vectors.
  // Fold each remaining mask into the accumulated Prev result, two vectors
  // at a time.
11492  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
11493    // Shuffle other input vectors, if any.
11494    std::pair<T *, bool> Res =
11495        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
11496    ArrayRef<int> SecMask = VMIt->second;
11497    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
11498      if (SecMask[I] != PoisonMaskElem) {
11499        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
11500               "Multiple uses of scalars.");
11501        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
11502      } else if (Mask[I] != PoisonMaskElem) {
11503        Mask[I] = I;
11504      }
11505    }
11506    Prev = Action(Mask, {Prev, Res.first});
11507  }
11508  return Prev;
11509}
11510
11511namespace {
11512/// Data type for handling buildvector sequences with the reused scalars from
11513/// other tree entries.
11514template <typename T> struct ShuffledInsertData {
11515  /// List of insertelements to be replaced by shuffles.
11516  SmallVector<InsertElementInst *> InsertElements;
11517  /// The parent vectors and shuffle mask for the given list of inserts.
  // NOTE(review): the member declaration itself (listing line 11518,
  // presumably the ValueMasks map used by getTreeCost below) is absent from
  // this extraction.
11519};
11520} // namespace
11521
// NOTE(review): the getTreeCost signature (listing lines 11522-11523) and a
// number of interior lines (e.g. 11568, 11571, 11573, 11609, 11636-11637,
// 11641, 11664, 11813, 11817, 11848-11849, 11866-11867, 11878, 11882-11883,
// 11919, 11931) are absent from this extraction; visible tokens are kept
// verbatim. Computes the total vector-vs-scalar cost of the current tree:
// per-entry costs, external-use extract costs, spill cost, and final
// shuffle/cast adjustments.
11524  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
11525                    << VectorizableTree.size() << ".\n");
11526
11527  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
11528
  // Pass 1: accumulate the cost of every tree entry.
11529  SmallPtrSet<Value *, 4> CheckedExtracts;
11530  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
11531    TreeEntry &TE = *VectorizableTree[I];
11532    // No need to count the cost for combined entries, they are combined and
11533    // just skip their cost.
11534    if (TE.State == TreeEntry::CombinedVectorize) {
11535      LLVM_DEBUG(
11536          dbgs() << "SLP: Skipping cost for combined node that starts with "
11537                 << *TE.Scalars[0] << ".\n";
11538          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
11539      continue;
11540    }
11541    if (TE.isGather()) {
11542      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
11543          E && E->getVectorFactor() == TE.getVectorFactor() &&
11544          E->isSame(TE.Scalars)) {
11545        // Some gather nodes might be absolutely the same as some vectorizable
11546        // nodes after reordering, need to handle it.
11547        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
11548                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
11549                          << "SLP: Current total cost = " << Cost << "\n");
11550        continue;
11551      }
11552    }
11553
11554    // Exclude cost of gather loads nodes which are not used. These nodes were
11555    // built as part of the final attempt to vectorize gathered loads.
11556    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
11557           "Expected gather nodes with users only.");
11558
11559    InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
11560    Cost += C;
11561    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
11562                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
11563                      << "SLP: Current total cost = " << Cost << "\n");
11564  }
11565
  // Pass 2: charge extraction of externally used scalars, or detect inserts
  // that can be turned into final shuffles instead.
11566  SmallPtrSet<Value *, 16> ExtractCostCalculated;
11567  InstructionCost ExtractCost = 0;
11569  SmallVector<APInt> DemandedElts;
11570  SmallDenseSet<Value *, 4> UsedInserts;
11572  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
11574  for (ExternalUser &EU : ExternalUses) {
11575    // Uses by ephemeral values are free (because the ephemeral value will be
11576    // removed prior to code generation, and so the extraction will be
11577    // removed as well) as well as uses in unreachable blocks or in landing pads
11578    // (rarely executed).
11579    if (EphValues.count(EU.User) ||
11580        (EU.User &&
11581         (!DT->isReachableFromEntry(cast<Instruction>(EU.User)->getParent()) ||
11582          cast<Instruction>(EU.User)->getParent()->isLandingPad())))
11583      continue;
11584
11585    // We only add extract cost once for the same scalar.
11586    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
11587        !ExtractCostCalculated.insert(EU.Scalar).second)
11588      continue;
11589
11590    // No extract cost for vector "scalar"
11591    if (isa<FixedVectorType>(EU.Scalar->getType()))
11592      continue;
11593
11594    // If found user is an insertelement, do not calculate extract cost but try
11595    // to detect it as a final shuffled/identity match.
11596    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
11597        VU && VU->getOperand(1) == EU.Scalar) {
11598      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
11599        if (!UsedInserts.insert(VU).second)
11600          continue;
11601        std::optional<unsigned> InsertIdx = getElementIndex(VU);
11602        if (InsertIdx) {
11603          const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
11604          auto *It = find_if(
11605              ShuffledInserts,
11606              [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
11607                // Checks if 2 insertelements are from the same buildvector.
11608                InsertElementInst *VecInsert = Data.InsertElements.front();
11610                    VU, VecInsert, [this](InsertElementInst *II) -> Value * {
11611                      Value *Op0 = II->getOperand(0);
11612                      if (getTreeEntry(II) && !getTreeEntry(Op0))
11613                        return nullptr;
11614                      return Op0;
11615                    });
11616              });
11617          int VecId = -1;
11618          if (It == ShuffledInserts.end()) {
11619            auto &Data = ShuffledInserts.emplace_back();
11620            Data.InsertElements.emplace_back(VU);
11621            DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
11622            VecId = ShuffledInserts.size() - 1;
11623            auto It = MinBWs.find(ScalarTE);
11624            if (It != MinBWs.end() &&
11625                VectorCasts
11626                    .insert(std::make_pair(ScalarTE, FTy->getElementType()))
11627                    .second) {
11628              unsigned BWSz = It->second.first;
11629              unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
11630              unsigned VecOpcode;
11631              if (DstBWSz < BWSz)
11632                VecOpcode = Instruction::Trunc;
11633              else
11634                VecOpcode =
11635                    It->second.second ? Instruction::SExt : Instruction::ZExt;
11638                  VecOpcode, FTy,
11639                  getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
11640                                 FTy->getNumElements()),
11642              LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
11643                                << " for extending externally used vector with "
11644                                   "non-equal minimum bitwidth.\n");
11645              Cost += C;
11646            }
11647          } else {
11648            if (isFirstInsertElement(VU, It->InsertElements.front()))
11649              It->InsertElements.front() = VU;
11650            VecId = std::distance(ShuffledInserts.begin(), It);
11651          }
11652          int InIdx = *InsertIdx;
11653          SmallVectorImpl<int> &Mask =
11654              ShuffledInserts[VecId].ValueMasks[ScalarTE];
11655          if (Mask.empty())
11656            Mask.assign(FTy->getNumElements(), PoisonMaskElem);
11657          Mask[InIdx] = EU.Lane;
11658          DemandedElts[VecId].setBit(InIdx);
11659          continue;
11660        }
11661      }
11662    }
11663
11665    // If we plan to rewrite the tree in a smaller type, we will need to sign
11666    // extend the extracted value back to the original type. Here, we account
11667    // for the extract and the added cost of the sign extend if needed.
11668    InstructionCost ExtraCost = TTI::TCC_Free;
11669    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
11670    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
11671    auto It = MinBWs.find(Entry);
11672    if (It != MinBWs.end()) {
11673      auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
11674      unsigned Extend =
11675          It->second.second ? Instruction::SExt : Instruction::ZExt;
11676      VecTy = getWidenedType(MinTy, BundleWidth);
11677      ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
11678                                                VecTy, EU.Lane);
11679    } else {
11680      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
11681                                          CostKind, EU.Lane);
11682    }
11683    // Leave the scalar instructions as is if they are cheaper than extracts.
11684    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
11685        Entry->getOpcode() == Instruction::Load) {
11686      // Checks if the user of the external scalar is phi in loop body.
11687      auto IsPhiInLoop = [&](const ExternalUser &U) {
11688        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
11689          auto *I = cast<Instruction>(U.Scalar);
11690          const Loop *L = LI->getLoopFor(Phi->getParent());
11691          return L && (Phi->getParent() == I->getParent() ||
11692                       L == LI->getLoopFor(I->getParent()));
11693        }
11694        return false;
11695      };
11696      if (!ValueToExtUses) {
11697        ValueToExtUses.emplace();
11698        for_each(enumerate(ExternalUses), [&](const auto &P) {
11699          // Ignore phis in loops.
11700          if (IsPhiInLoop(P.value()))
11701            return;
11702
11703          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
11704        });
11705      }
11706      // Can use original instruction, if no operands vectorized or they are
11707      // marked as externally used already.
11708      auto *Inst = cast<Instruction>(EU.Scalar);
11709      bool CanBeUsedAsScalar = all_of(Inst->operands(), [&](Value *V) {
11710        if (!getTreeEntry(V)) {
11711          // Some extractelements might be not vectorized, but
11712          // transformed into shuffle and removed from the function,
11713          // consider it here.
11714          if (auto *EE = dyn_cast<ExtractElementInst>(V))
11715            return !EE->hasOneUse() || !MustGather.contains(EE);
11716          return true;
11717        }
11718        return ValueToExtUses->contains(V);
11719      });
11720      if (CanBeUsedAsScalar) {
11721        InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
11722        bool KeepScalar = ScalarCost <= ExtraCost;
11723        // Try to keep original scalar if the user is the phi node from the same
11724        // block as the root phis, currently vectorized. It allows to keep
11725        // better ordering info of PHIs, being vectorized currently.
11726        bool IsProfitablePHIUser =
11727            (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
11728                            VectorizableTree.front()->Scalars.size() > 2)) &&
11729            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
11730            !Inst->hasNUsesOrMore(UsesLimit) &&
11731            none_of(Inst->users(),
11732                    [&](User *U) {
11733                      auto *PHIUser = dyn_cast<PHINode>(U);
11734                      return (!PHIUser ||
11735                              PHIUser->getParent() !=
11736                                  cast<Instruction>(
11737                                      VectorizableTree.front()->getMainOp())
11738                                      ->getParent()) &&
11739                             !getTreeEntry(U);
11740                    }) &&
11741            count_if(Entry->Scalars, [&](Value *V) {
11742              return ValueToExtUses->contains(V);
11743            }) <= 2;
11744        if (IsProfitablePHIUser) {
11745          KeepScalar = true;
11746        } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
11747                   ExtraCost - ScalarCost <= TTI::TCC_Basic &&
11748                   (GatheredLoadsEntriesFirst == NoGatheredLoads ||
11749                    Entry->Idx < GatheredLoadsEntriesFirst)) {
11750          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
11751            return ValueToExtUses->contains(V);
11752          });
11753          auto It = ExtractsCount.find(Entry);
11754          if (It != ExtractsCount.end()) {
11755            assert(ScalarUsesCount >= It->getSecond().size() &&
11756                   "Expected total number of external uses not less than "
11757                   "number of scalar uses.");
11758            ScalarUsesCount -= It->getSecond().size();
11759          }
11760          // Keep original scalar if number of externally used instructions in
11761          // the same entry is not power of 2. It may help to do some extra
11762          // vectorization for now.
11763          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
11764        }
11765        if (KeepScalar) {
11766          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
11767          for_each(Inst->operands(), [&](Value *V) {
11768            auto It = ValueToExtUses->find(V);
11769            if (It != ValueToExtUses->end()) {
11770              // Replace all uses to avoid compiler crash.
11771              ExternalUses[It->second].User = nullptr;
11772            }
11773          });
11774          ExtraCost = ScalarCost;
11775          if (!IsPhiInLoop(EU))
11776            ExtractsCount[Entry].insert(Inst);
11777        }
11778      }
11779    }
11780
11781    ExtractCost += ExtraCost;
11782  }
11783  // Add reduced value cost, if resized.
11784  if (!VectorizedVals.empty()) {
11785    const TreeEntry &Root = *VectorizableTree.front();
11786    auto BWIt = MinBWs.find(&Root);
11787    if (BWIt != MinBWs.end()) {
11788      Type *DstTy = Root.Scalars.front()->getType();
11789      unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
11790      unsigned SrcSz =
11791          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
11792      if (OriginalSz != SrcSz) {
11793        unsigned Opcode = Instruction::Trunc;
11794        if (OriginalSz > SrcSz)
11795          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
11796        Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
11797        Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
11800      }
11801    }
11802  }
11803
11804  InstructionCost SpillCost = getSpillCost();
11805  Cost += SpillCost + ExtractCost;
  // Resizes a mismatching tree-entry vector to the insert's VF and charges
  // the extra shuffle; returns (entry, whether a resize happened).
11806  auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
11807                                    bool) {
11808    InstructionCost C = 0;
11809    unsigned VF = Mask.size();
11810    unsigned VecVF = TE->getVectorFactor();
11811    if (VF != VecVF &&
11812        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
11814      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
11815      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
11816                OrigMask.begin());
11818          getWidenedType(TE->getMainOp()->getType(), VecVF),
11819          OrigMask);
11820      LLVM_DEBUG(
11821          dbgs() << "SLP: Adding cost " << C
11822                 << " for final shuffle of insertelement external users.\n";
11823          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
11824      Cost += C;
11825      return std::make_pair(TE, true);
11826    }
11827    return std::make_pair(TE, false);
11828  };
11829  // Calculate the cost of the reshuffled vectors, if any.
11830  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
11831    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
11832    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
11833    unsigned VF = 0;
    // Charges the shuffle combining 1 or 2 tree entries for one mask step;
    // identity masks are free.
11834    auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
11836      assert((TEs.size() == 1 || TEs.size() == 2) &&
11837             "Expected exactly 1 or 2 tree entries.");
11838      if (TEs.size() == 1) {
11839        if (VF == 0)
11840          VF = TEs.front()->getVectorFactor();
11841        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11842        if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
11843            !all_of(enumerate(Mask), [=](const auto &Data) {
11844              return Data.value() == PoisonMaskElem ||
11845                     (Data.index() < VF &&
11846                      static_cast<int>(Data.index()) == Data.value());
11847            })) {
11850          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
11851                            << " for final shuffle of insertelement "
11852                               "external users.\n";
11853                     TEs.front()->dump();
11854                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
11855          Cost += C;
11856        }
11857      } else {
11858        if (VF == 0) {
11859          if (TEs.front() &&
11860              TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
11861            VF = TEs.front()->getVectorFactor();
11862          else
11863            VF = Mask.size();
11864        }
11865        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
11868        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
11869                          << " for final shuffle of vector node and external "
11870                             "insertelement users.\n";
11871                   if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
11872                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
11873        Cost += C;
11874      }
11875      VF = Mask.size();
11876      return TEs.back();
11877    };
11879        MutableArrayRef(Vector.data(), Vector.size()), Base,
11880        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
11881        EstimateShufflesCost);
11884            ShuffledInserts[I].InsertElements.front()->getType()),
11885        DemandedElts[I],
11886        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
11887    Cost -= InsertCost;
11888  }
11889
11890  // Add the cost for reduced value resize (if required).
11891  if (ReductionBitWidth != 0) {
11892    assert(UserIgnoreList && "Expected reduction tree.");
11893    const TreeEntry &E = *VectorizableTree.front();
11894    auto It = MinBWs.find(&E);
11895    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
11896      unsigned SrcSize = It->second.first;
11897      unsigned DstSize = ReductionBitWidth;
11898      unsigned Opcode = Instruction::Trunc;
11899      if (SrcSize < DstSize)
11900        Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11901      auto *SrcVecTy =
11902          getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
11903      auto *DstVecTy =
11904          getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
11905      TTI::CastContextHint CCH = getCastContextHint(E);
11906      InstructionCost CastCost;
11907      switch (E.getOpcode()) {
11908      case Instruction::SExt:
11909      case Instruction::ZExt:
11910      case Instruction::Trunc: {
11911        const TreeEntry *OpTE = getOperandEntry(&E, 0);
11912        CCH = getCastContextHint(*OpTE);
11913        break;
11914      }
11915      default:
11916        break;
11917      }
11918      CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
11920      Cost += CastCost;
11921      LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
11922                        << " for final resize for reduction from " << SrcVecTy
11923                        << " to " << DstVecTy << "\n";
11924                 dbgs() << "SLP: Current total cost = " << Cost << "\n");
11925    }
11926  }
11927
11928#ifndef NDEBUG
11929  SmallString<256> Str;
11930  {
11932    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
11933       << "SLP: Extract Cost = " << ExtractCost << ".\n"
11934       << "SLP: Total Cost = " << Cost << ".\n";
11935  }
11936  LLVM_DEBUG(dbgs() << Str);
11937  if (ViewSLPTree)
11938    ViewGraph(this, "SLP" + F->getName(), false, Str);
11939#endif
11940
11941  return Cost;
11942}
11943
11944/// Tries to find extractelement instructions with constant indices from fixed
11945/// vector type and gather such instructions into a bunch, which highly likely
11946/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
11947/// successful, the matched scalars are replaced by poison values in \p VL for
11948/// future analysis.
11949std::optional<TTI::ShuffleKind>
11950BoUpSLP::tryToGatherSingleRegisterExtractElements(
// NOTE(review): the parameter-list line (orig. 11951) is elided in this
// extraction; per the calls below it takes the mutable scalar list VL and an
// output shuffle Mask — verify against upstream before editing.
11952 // Scan list of gathered scalars for extractelements that can be represented
11953 // as shuffles.
// Indices in VL whose extract is effectively undef (missing/invalid index or
// undef source vector) are collected separately and may join either choice.
11955 SmallVector<int> UndefVectorExtracts;
11956 for (int I = 0, E = VL.size(); I < E; ++I) {
11957 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
11958 if (!EI) {
11959 if (isa<UndefValue>(VL[I]))
11960 UndefVectorExtracts.push_back(I);
11961 continue;
11962 }
11963 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
11964 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
11965 continue;
11966 std::optional<unsigned> Idx = getExtractIndex(EI);
11967 // Undefined index.
11968 if (!Idx) {
11969 UndefVectorExtracts.push_back(I);
11970 continue;
11971 }
// Out-of-bounds extract index yields poison; treat like an undef extract.
11972 if (Idx >= VecTy->getNumElements()) {
11973 UndefVectorExtracts.push_back(I);
11974 continue;
11975 }
// If every lane of the source vector except the extracted one is undef and
// the extracted lane is too, the whole extract is undef-like.
11976 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
11977 ExtractMask.reset(*Idx);
11978 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
11979 UndefVectorExtracts.push_back(I);
11980 continue;
11981 }
11982 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
11983 }
11984 // Sort the vector operands by the maximum number of uses in extractelements.
// NOTE(review): the declaration of 'Vectors' (orig. 11985, initialized from
// VectorOpToIdx.takeVector()) is elided here — confirm against upstream.
11986 VectorOpToIdx.takeVector();
11987 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
11988 return P1.second.size() > P2.second.size();
11989 });
11990 // Find the best pair of the vectors or a single vector.
11991 const int UndefSz = UndefVectorExtracts.size();
11992 unsigned SingleMax = 0;
11993 unsigned PairMax = 0;
11994 if (!Vectors.empty()) {
11995 SingleMax = Vectors.front().second.size() + UndefSz;
11996 if (Vectors.size() > 1) {
11997 auto *ItNext = std::next(Vectors.begin());
11998 PairMax = SingleMax + ItNext->second.size();
11999 }
12000 }
12001 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
12002 return std::nullopt;
12003 // Check if better to perform a shuffle of 2 vectors or just of a single
12004 // vector.
// Keep a copy so VL can be restored if the shuffle match below fails.
12005 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
12006 SmallVector<Value *> GatheredExtracts(
12007 VL.size(), PoisonValue::get(VL.front()->getType()));
// Move the chosen extracts out of VL (replacing them with poison) and into
// GatheredExtracts; the swap makes the exchange reversible.
12008 if (SingleMax >= PairMax && SingleMax) {
12009 for (int Idx : Vectors.front().second)
12010 std::swap(GatheredExtracts[Idx], VL[Idx]);
12011 } else if (!Vectors.empty()) {
12012 for (unsigned Idx : {0, 1})
12013 for (int Idx : Vectors[Idx].second)
12014 std::swap(GatheredExtracts[Idx], VL[Idx]);
12015 }
12016 // Add extracts from undefs too.
12017 for (int Idx : UndefVectorExtracts)
12018 std::swap(GatheredExtracts[Idx], VL[Idx]);
12019 // Check that gather of extractelements can be represented as just a
12020 // shuffle of a single/two vectors the scalars are extracted from.
12021 std::optional<TTI::ShuffleKind> Res =
12022 isFixedVectorShuffle(GatheredExtracts, Mask);
12023 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
12024 // TODO: try to check other subsets if possible.
12025 // Restore the original VL if attempt was not successful.
12026 copy(SavedVL, VL.begin());
12027 return std::nullopt;
12028 }
12029 // Restore unused scalars from mask, if some of the extractelements were not
12030 // selected for shuffle.
12031 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
12032 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
12033 isa<UndefValue>(GatheredExtracts[I])) {
12034 std::swap(VL[I], GatheredExtracts[I]);
12035 continue;
12036 }
12037 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12038 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
12039 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
12040 is_contained(UndefVectorExtracts, I))
12041 continue;
// NOTE(review): the loop tail after this 'continue' appears empty in this
// extraction — upstream may restore VL[I] here; verify before relying on it.
12042 }
12043 return Res;
12044}
12045
12046/// Tries to find extractelement instructions with constant indices from fixed
12047/// vector type and gather such instructions into a bunch, which highly likely
12048/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
12049/// successful, the matched scalars are replaced by poison values in \p VL for
12050/// future analysis.
// NOTE(review): the return-type line (orig. 12051) and the Mask parameter line
// (orig. 12053) are elided in this extraction; per the body, the result is one
// optional ShuffleKind per register part — verify against upstream.
12052BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
12054                                    unsigned NumParts) const {
12055 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
12056 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
12057 Mask.assign(VL.size(), PoisonMaskElem);
// Split VL into per-register slices and try the single-register analysis on
// each slice independently, merging each sub-mask into the full Mask.
12058 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12059 for (unsigned Part : seq<unsigned>(NumParts)) {
12060 // Scan list of gathered scalars for extractelements that can be represented
12061 // as shuffles.
// NOTE(review): the head of the SubVL declaration (orig. 12062) is elided.
12063 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
12064 SmallVector<int> SubMask;
12065 std::optional<TTI::ShuffleKind> Res =
12066 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
12067 ShufflesRes[Part] = Res;
12068 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
12069 }
// If no part matched a shuffle at all, return an empty vector as "no match".
12070 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
12071 return Res.has_value();
12072 }))
12073 ShufflesRes.clear();
12074 return ShufflesRes;
12075}
12076
// Checks whether the gathered scalars VL (one register-part of gather node TE)
// can be produced as a shuffle of at most two already-built tree entries.
// On success fills Entries (1 or 2 source entries) and the Part-slice of Mask;
// on failure clears Entries, poisons the Mask slice, and returns nullopt.
// NOTE(review): several declaration lines are elided in this extraction
// (UsedTEs at orig. 12138, VToTEs at 12144, VFToTE at 12284, EntryLanes at
// 12373, TempEntries at 12392, and the returns at 12274-12275/12434/12438) —
// verify against upstream before modifying.
12077std::optional<TargetTransformInfo::ShuffleKind>
12078BoUpSLP::isGatherShuffledSingleRegisterEntry(
12079 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
12080 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
12081 Entries.clear();
12082 // TODO: currently checking only for Scalars in the tree entry, need to count
12083 // reused elements too for better cost estimation.
12084 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
12085 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
12086 : TE->UserTreeIndices.front();
12087 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
12088 const BasicBlock *TEInsertBlock = nullptr;
12089 // Main node of PHI entries keeps the correct order of operands/incoming
12090 // blocks.
12091 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
12092 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
12093 TEInsertPt = TEInsertBlock->getTerminator();
12094 } else {
12095 TEInsertBlock = TEInsertPt->getParent();
12096 }
12097 if (!DT->isReachableFromEntry(TEInsertBlock))
12098 return std::nullopt;
12099 auto *NodeUI = DT->getNode(TEInsertBlock);
12100 assert(NodeUI && "Should only process reachable instructions");
12101 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
12102 auto CheckOrdering = [&](const Instruction *InsertPt) {
12103 // Argument InsertPt is an instruction where vector code for some other
12104 // tree entry (one that shares one or more scalars with TE) is going to be
12105 // generated. This lambda returns true if insertion point of vector code
12106 // for the TE dominates that point (otherwise dependency is the other way
12107 // around). The other node is not limited to be of a gather kind. Gather
12108 // nodes are not scheduled and their vector code is inserted before their
12109 // first user. If user is PHI, that is supposed to be at the end of a
12110 // predecessor block. Otherwise it is the last instruction among scalars of
12111 // the user node. So, instead of checking dependency between instructions
12112 // themselves, we check dependency between their insertion points for vector
12113 // code (since each scalar instruction ends up as a lane of a vector
12114 // instruction).
12115 const BasicBlock *InsertBlock = InsertPt->getParent();
12116 auto *NodeEUI = DT->getNode(InsertBlock);
12117 if (!NodeEUI)
12118 return false;
12119 assert((NodeUI == NodeEUI) ==
12120 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
12121 "Different nodes should have different DFS numbers");
12122 // Check the order of the gather nodes users.
12123 if (TEInsertPt->getParent() != InsertBlock &&
12124 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
12125 return false;
12126 if (TEInsertPt->getParent() == InsertBlock &&
12127 TEInsertPt->comesBefore(InsertPt))
12128 return false;
12129 return true;
12130 };
12131 // Find all tree entries used by the gathered values. If no common entries
12132 // found - not a shuffle.
12133 // Here we build a set of tree nodes for each gathered value and trying to
12134 // find the intersection between these sets. If we have at least one common
12135 // tree node for each gathered value - we have just a permutation of the
12136 // single vector. If we have 2 different sets, we're in situation where we
12137 // have a permutation of 2 input vectors.
12139 DenseMap<Value *, int> UsedValuesEntry;
12140 for (Value *V : VL) {
12141 if (isConstant(V))
12142 continue;
12143 // Build a list of tree entries where V is used.
12145 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
12146 if (TEPtr == TE)
12147 continue;
12148 assert(any_of(TEPtr->Scalars,
12149 [&](Value *V) { return GatheredScalars.contains(V); }) &&
12150 "Must contain at least single gathered value.");
12151 assert(TEPtr->UserTreeIndices.size() == 1 &&
12152 "Expected only single user of a gather node.");
12153 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
12154
12155 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
12156 const Instruction *InsertPt =
12157 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
12158 : &getLastInstructionInBundle(UseEI.UserTE);
12159 if (TEInsertPt == InsertPt) {
12160 // If 2 gathers are operands of the same entry (regardless of whether
12161 // user is PHI or else), compare operands indices, use the earlier one
12162 // as the base.
12163 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
12164 continue;
12165 // If the user instruction is used for some reason in different
12166 // vectorized nodes - make it depend on index.
12167 if (TEUseEI.UserTE != UseEI.UserTE &&
12168 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
12169 continue;
12170 }
12171
12172 // Check if the user node of the TE comes after user node of TEPtr,
12173 // otherwise TEPtr depends on TE.
12174 if ((TEInsertBlock != InsertPt->getParent() ||
12175 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
12176 !CheckOrdering(InsertPt))
12177 continue;
12178 VToTEs.insert(TEPtr);
12179 }
// Also consider the vectorized entry that directly contains V, if any.
12180 if (const TreeEntry *VTE = getTreeEntry(V)) {
12181 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) {
12182 if (VTE->State != TreeEntry::Vectorize) {
12183 auto It = MultiNodeScalars.find(V);
12184 if (It == MultiNodeScalars.end())
12185 continue;
12186 VTE = *It->getSecond().begin();
12187 // Iterate through all vectorized nodes.
12188 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
12189 return MTE->State == TreeEntry::Vectorize;
12190 });
12191 if (MIt == It->getSecond().end())
12192 continue;
12193 VTE = *MIt;
12194 }
12195 }
12196 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
12197 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
12198 continue;
12199 VToTEs.insert(VTE);
12200 }
12201 if (VToTEs.empty())
12202 continue;
12203 if (UsedTEs.empty()) {
12204 // The first iteration, just insert the list of nodes to vector.
12205 UsedTEs.push_back(VToTEs);
12206 UsedValuesEntry.try_emplace(V, 0);
12207 } else {
12208 // Need to check if there are any previously used tree nodes which use V.
12209 // If there are no such nodes, consider that we have another one input
12210 // vector.
12211 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
12212 unsigned Idx = 0;
12213 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
12214 // Do we have a non-empty intersection of previously listed tree entries
12215 // and tree entries using current V?
12216 set_intersect(VToTEs, Set);
12217 if (!VToTEs.empty()) {
12218 // Yes, write the new subset and continue analysis for the next
12219 // scalar.
12220 Set.swap(VToTEs);
12221 break;
12222 }
12223 VToTEs = SavedVToTEs;
12224 ++Idx;
12225 }
12226 // No non-empty intersection found - need to add a second set of possible
12227 // source vectors.
12228 if (Idx == UsedTEs.size()) {
12229 // If the number of input vectors is greater than 2 - not a permutation,
12230 // fallback to the regular gather.
12231 // TODO: support multiple reshuffled nodes.
12232 if (UsedTEs.size() == 2)
12233 continue;
12234 UsedTEs.push_back(SavedVToTEs);
12235 Idx = UsedTEs.size() - 1;
12236 }
12237 UsedValuesEntry.try_emplace(V, Idx);
12238 }
12239 }
12240
12241 if (UsedTEs.empty()) {
12242 Entries.clear();
12243 return std::nullopt;
12244 }
12245
12246 unsigned VF = 0;
12247 if (UsedTEs.size() == 1) {
12248 // Keep the order to avoid non-determinism.
12249 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
12250 UsedTEs.front().end());
12251 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
12252 return TE1->Idx < TE2->Idx;
12253 });
12254 // Try to find the perfect match in another gather node at first.
12255 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
12256 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
12257 });
12258 if (It != FirstEntries.end() &&
12259 ((*It)->getVectorFactor() == VL.size() ||
12260 ((*It)->getVectorFactor() == TE->Scalars.size() &&
12261 TE->ReuseShuffleIndices.size() == VL.size() &&
12262 (*It)->isSame(TE->Scalars)))) {
12263 Entries.push_back(*It);
12264 if ((*It)->getVectorFactor() == VL.size()) {
12265 std::iota(std::next(Mask.begin(), Part * VL.size()),
12266 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
12267 } else {
12268 SmallVector<int> CommonMask = TE->getCommonMask();
12269 copy(CommonMask, Mask.begin());
12270 }
12271 // Clear undef scalars.
12272 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
12273 if (isa<PoisonValue>(VL[I]))
// NOTE(review): the loop body and the early return for this perfect-match
// case (orig. 12274-12275) are elided in this extraction.
12276 }
12277 // No perfect match, just shuffle, so choose the first tree node from the
12278 // tree.
12279 Entries.push_back(FirstEntries.front());
12280 } else {
12281 // Try to find nodes with the same vector factor.
12282 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
12283 // Keep the order of tree nodes to avoid non-determinism.
12285 for (const TreeEntry *TE : UsedTEs.front()) {
12286 unsigned VF = TE->getVectorFactor();
12287 auto It = VFToTE.find(VF);
12288 if (It != VFToTE.end()) {
12289 if (It->second->Idx > TE->Idx)
12290 It->getSecond() = TE;
12291 continue;
12292 }
12293 VFToTE.try_emplace(VF, TE);
12294 }
12295 // Same, keep the order to avoid non-determinism.
12296 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
12297 UsedTEs.back().end());
12298 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
12299 return TE1->Idx < TE2->Idx;
12300 });
12301 for (const TreeEntry *TE : SecondEntries) {
12302 auto It = VFToTE.find(TE->getVectorFactor());
12303 if (It != VFToTE.end()) {
12304 VF = It->first;
12305 Entries.push_back(It->second);
12306 Entries.push_back(TE);
12307 break;
12308 }
12309 }
12310 // No 2 source vectors with the same vector factor - just choose 2 with max
12311 // index.
12312 if (Entries.empty()) {
12313 Entries.push_back(*llvm::max_element(
12314 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
12315 return TE1->Idx < TE2->Idx;
12316 }));
12317 Entries.push_back(SecondEntries.front());
12318 VF = std::max(Entries.front()->getVectorFactor(),
12319 Entries.back()->getVectorFactor());
12320 }
12321 }
12322
12323 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
12324 // Checks if the 2 PHIs are compatible in terms of high possibility to be
12325 // vectorized.
12326 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
12327 auto *PHI = cast<PHINode>(V);
12328 auto *PHI1 = cast<PHINode>(V1);
12329 // Check that all incoming values are compatible/from same parent (if they
12330 // are instructions).
12331 // The incoming values are compatible if they all are constants, or
12332 // instruction with the same/alternate opcodes from the same basic block.
12333 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
12334 Value *In = PHI->getIncomingValue(I);
12335 Value *In1 = PHI1->getIncomingValue(I);
12336 if (isConstant(In) && isConstant(In1))
12337 continue;
12338 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
12339 return false;
// NOTE(review): the right-hand side of this parent comparison (orig. 12341)
// is elided; it presumably compares against In1's parent block.
12340 if (cast<Instruction>(In)->getParent() !=
12342 return false;
12343 }
12344 return true;
12345 };
12346 // Check if the value can be ignored during analysis for shuffled gathers.
12347 // We suppose it is better to ignore instruction, which do not form splats,
12348 // are not vectorized/not extractelements (these instructions will be handled
12349 // by extractelements processing) or may form vector node in future.
12350 auto MightBeIgnored = [=](Value *V) {
12351 auto *I = dyn_cast<Instruction>(V);
12352 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
12354 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
12355 };
12356 // Check that the neighbor instruction may form a full vector node with the
12357 // current instruction V. It is possible, if they have same/alternate opcode
12358 // and same parent basic block.
12359 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
12360 Value *V1 = VL[Idx];
12361 bool UsedInSameVTE = false;
12362 auto It = UsedValuesEntry.find(V1);
12363 if (It != UsedValuesEntry.end())
12364 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
12365 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
12366 getSameOpcode({V, V1}, *TLI).getOpcode() &&
12367 cast<Instruction>(V)->getParent() ==
12368 cast<Instruction>(V1)->getParent() &&
12369 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
12370 };
12371 // Build a shuffle mask for better cost estimation and vector emission.
12372 SmallBitVector UsedIdxs(Entries.size());
12374 for (int I = 0, E = VL.size(); I < E; ++I) {
12375 Value *V = VL[I];
12376 auto It = UsedValuesEntry.find(V);
12377 if (It == UsedValuesEntry.end())
12378 continue;
12379 // Do not try to shuffle scalars, if they are constants, or instructions
12380 // that can be vectorized as a result of the following vector build
12381 // vectorization.
12382 if (isConstant(V) || (MightBeIgnored(V) &&
12383 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
12384 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
12385 continue;
12386 unsigned Idx = It->second;
12387 EntryLanes.emplace_back(Idx, I);
12388 UsedIdxs.set(Idx);
12389 }
12390 // Iterate through all shuffled scalars and select entries, which can be used
12391 // for final shuffle.
12393 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
12394 if (!UsedIdxs.test(I))
12395 continue;
12396 // Fix the entry number for the given scalar. If it is the first entry, set
12397 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
12398 // These indices are used when calculating final shuffle mask as the vector
12399 // offset.
12400 for (std::pair<unsigned, int> &Pair : EntryLanes)
12401 if (Pair.first == I)
12402 Pair.first = TempEntries.size();
12403 TempEntries.push_back(Entries[I]);
12404 }
12405 Entries.swap(TempEntries);
12406 if (EntryLanes.size() == Entries.size() &&
12407 !VL.equals(ArrayRef(TE->Scalars)
12408 .slice(Part * VL.size(),
12409 std::min<int>(VL.size(), TE->Scalars.size())))) {
12410 // We may have here 1 or 2 entries only. If the number of scalars is equal
12411 // to the number of entries, no need to do the analysis, it is not very
12412 // profitable. Since VL is not the same as TE->Scalars, it means we already
12413 // have some shuffles before. Cut off not profitable case.
12414 Entries.clear();
12415 return std::nullopt;
12416 }
12417 // Build the final mask, check for the identity shuffle, if possible.
12418 bool IsIdentity = Entries.size() == 1;
12419 // Pair.first is the offset to the vector, while Pair.second is the index of
12420 // scalar in the list.
12421 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
12422 unsigned Idx = Part * VL.size() + Pair.second;
12423 Mask[Idx] =
12424 Pair.first * VF +
12425 (ForOrder ? std::distance(
12426 Entries[Pair.first]->Scalars.begin(),
12427 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
12428 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
12429 IsIdentity &= Mask[Idx] == Pair.second;
12430 }
// NOTE(review): the 'return' statements for the profitable single- and
// two-source cases (orig. 12434 and 12438) are elided in this extraction.
12431 switch (Entries.size()) {
12432 case 1:
12433 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
12435 break;
12436 case 2:
12437 if (EntryLanes.size() > 2 || VL.size() <= 2)
12439 break;
12440 default:
12441 break;
12442 }
// Not profitable as a shuffle: drop the entries and poison this Part's mask.
12443 Entries.clear();
12444 // Clear the corresponding mask elements.
12445 std::fill(std::next(Mask.begin(), Part * VL.size()),
12446 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
12447 return std::nullopt;
12448}
12449
// Multi-register driver for isGatherShuffledSingleRegisterEntry: splits VL
// into NumParts register-sized slices and analyzes each slice separately.
// NOTE(review): the return-type line (orig. 12450) and the declaration of the
// per-part result vector 'Res' (orig. 12484) are elided in this extraction —
// verify against upstream before editing.
12451BoUpSLP::isGatherShuffledEntry(
12452 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
12453 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
12454 bool ForOrder) {
12455 assert(NumParts > 0 && NumParts < VL.size() &&
12456 "Expected positive number of registers.");
12457 Entries.clear();
12458 // No need to check for the topmost gather node.
12459 if (TE == VectorizableTree.front().get() &&
12460 (GatheredLoadsEntriesFirst == NoGatheredLoads ||
12461 none_of(ArrayRef(VectorizableTree).drop_front(),
12462 [](const std::unique_ptr<TreeEntry> &TE) {
12463 return !TE->isGather();
12464 })))
12465 return {};
12466 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
12467 if (TE->isNonPowOf2Vec())
12468 return {};
12469 Mask.assign(VL.size(), PoisonMaskElem);
12470 assert((TE->UserTreeIndices.size() == 1 ||
12471 TE == VectorizableTree.front().get()) &&
12472 "Expected only single user of the gather node.");
12473 assert(VL.size() % NumParts == 0 &&
12474 "Number of scalars must be divisible by NumParts.");
12475 if (!TE->UserTreeIndices.empty() &&
12476 TE->UserTreeIndices.front().UserTE->isGather() &&
12477 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
12478 assert((TE->Idx == 0 || TE->getOpcode() == Instruction::ExtractElement ||
12479 isSplat(TE->Scalars)) &&
12480 "Expected splat or extractelements only node.");
12481 return {};
12482 }
12483 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
12485 for (unsigned Part : seq<unsigned>(NumParts)) {
12486 ArrayRef<Value *> SubVL =
12487 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
12488 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
12489 std::optional<TTI::ShuffleKind> SubRes =
12490 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
12491 ForOrder);
12492 if (!SubRes)
12493 SubEntries.clear();
12494 Res.push_back(SubRes);
// If one part already maps onto a full-width identical entry, the whole
// gather is that single entry: collapse the result to one identity shuffle.
12495 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
12496 SubEntries.front()->getVectorFactor() == VL.size() &&
12497 (SubEntries.front()->isSame(TE->Scalars) ||
12498 SubEntries.front()->isSame(VL))) {
12499 SmallVector<const TreeEntry *> LocalSubEntries;
12500 LocalSubEntries.swap(SubEntries);
12501 Entries.clear();
12502 Res.clear();
12503 std::iota(Mask.begin(), Mask.end(), 0);
12504 // Clear undef scalars.
12505 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
12506 if (isa<PoisonValue>(VL[I]))
// NOTE(review): the poison-lane assignment (orig. 12507) and the line
// re-populating Res (orig. 12509) are elided in this extraction.
12508 Entries.emplace_back(1, LocalSubEntries.front());
12510 return Res;
12511 }
12512 }
// No part matched - report failure with empty result.
12513 if (all_of(Res,
12514 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
12515 Entries.clear();
12516 return {};
12517 }
12518 return Res;
12519}
12520
// Estimates the cost of materializing the gathered scalars VL as a vector of
// ScalarTy elements: per-element insert costs for unique non-constant values
// plus a shuffle cost when duplicates must be replicated.
// NOTE(review): the declarations of 'Cost' and 'CostKind' (orig. 12530-12531),
// the cost-kind argument of the cast cost (orig. 12535) and the head of the
// final getShuffleCost call (orig. 12583) are elided in this extraction.
12521InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
12522 Type *ScalarTy) const {
12523 auto *VecTy = getWidenedType(ScalarTy, VL.size());
12524 bool DuplicateNonConst = false;
12525 // Find the cost of inserting/extracting values from the vector.
12526 // Check if the same elements are inserted several times and count them as
12527 // shuffle candidates.
12528 APInt ShuffledElements = APInt::getZero(VL.size());
12529 DenseMap<Value *, unsigned> UniqueElements;
12532 auto EstimateInsertCost = [&](unsigned I, Value *V) {
// A scalar of a narrower type must be truncated before insertion.
12533 if (V->getType() != ScalarTy) {
12534 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
12536 V = nullptr;
12537 }
12538 if (!ForPoisonSrc)
12539 Cost +=
12540 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
12541 I, Constant::getNullValue(VecTy), V);
12542 };
12543 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
12544 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
12545 Value *V = VL[I];
12546 // No need to shuffle duplicates for constants.
12547 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
12548 ShuffledElements.setBit(I);
12549 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
12550 continue;
12551 }
12552
// First occurrence of a value pays the insert cost; repeats are handled by
// a shuffle from the already-inserted lane.
12553 auto Res = UniqueElements.try_emplace(V, I);
12554 if (Res.second) {
12555 EstimateInsertCost(I, V);
12556 ShuffleMask[I] = I;
12557 continue;
12558 }
12559
12560 DuplicateNonConst = true;
12561 ShuffledElements.setBit(I);
12562 ShuffleMask[I] = Res.first->second;
12563 }
12564 if (ForPoisonSrc) {
12565 if (isa<FixedVectorType>(ScalarTy)) {
12566 assert(SLPReVec && "Only supported by REVEC.");
12567 // We don't need to insert elements one by one. Instead, we can insert the
12568 // entire vector into the destination.
12569 Cost = 0;
12570 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
12571 for (unsigned I : seq<unsigned>(VL.size()))
12572 if (!ShuffledElements[I])
12573 Cost += TTI->getShuffleCost(
12574 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
12575 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
12576 } else {
12577 Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
12578 /*Insert*/ true,
12579 /*Extract*/ false, CostKind);
12580 }
12581 }
12582 if (DuplicateNonConst)
12584 VecTy, ShuffleMask);
12585 return Cost;
12586}
12587
12588// Perform operand reordering on the instructions in VL and return the reordered
12589// operands in Left and Right.
// NOTE(review): the two out-parameter lines (orig. 12591-12592, presumably
// 'SmallVectorImpl<Value *> &Left/&Right' given the assignments below) are
// elided in this extraction — verify against upstream.
12590void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
12593                                             const BoUpSLP &R) {
// Empty bundle: nothing to reorder, leave Left/Right untouched.
12594 if (VL.empty())
12595 return;
12596 VLOperands Ops(VL, R);
12597 // Reorder the operands in place.
12598 Ops.reorder();
12599 Left = Ops.getVL(0);
12600 Right = Ops.getVL(1);
12601}
12602
12603Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
12604 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
12605 if (Res)
12606 return *Res;
12607 // Get the basic block this bundle is in. All instructions in the bundle
12608 // should be in this block (except for extractelement-like instructions with
12609 // constant indices or gathered loads).
12610 auto *Front = E->getMainOp();
12611 auto *BB = Front->getParent();
12612 assert(((GatheredLoadsEntriesFirst != NoGatheredLoads &&
12613 E->getOpcode() == Instruction::Load && E->isGather() &&
12614 E->Idx < GatheredLoadsEntriesFirst) ||
12615 all_of(E->Scalars,
12616 [=](Value *V) -> bool {
12617 if (E->getOpcode() == Instruction::GetElementPtr &&
12618 !isa<GetElementPtrInst>(V))
12619 return true;
12620 auto *I = cast<Instruction>(V);
12621 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
12622 isVectorLikeInstWithConstOps(I);
12623 })) &&
12624 "Expected gathered loads or GEPs or instructions from same basic "
12625 "block.");
12626
12627 auto FindLastInst = [&]() {
12628 Instruction *LastInst = Front;
12629 for (Value *V : E->Scalars) {
12630 auto *I = dyn_cast<Instruction>(V);
12631 if (!I)
12632 continue;
12633 if (LastInst->getParent() == I->getParent()) {
12634 if (LastInst->comesBefore(I))
12635 LastInst = I;
12636 continue;
12637 }
12638 assert(((E->getOpcode() == Instruction::GetElementPtr &&
12640 (isVectorLikeInstWithConstOps(LastInst) &&
12642 (GatheredLoadsEntriesFirst != NoGatheredLoads &&
12643 E->getOpcode() == Instruction::Load && E->isGather() &&
12644 E->Idx < GatheredLoadsEntriesFirst)) &&
12645 "Expected vector-like or non-GEP in GEP node insts only.");
12646 if (!DT->isReachableFromEntry(LastInst->getParent())) {
12647 LastInst = I;
12648 continue;
12649 }
12650 if (!DT->isReachableFromEntry(I->getParent()))
12651 continue;
12652 auto *NodeA = DT->getNode(LastInst->getParent());
12653 auto *NodeB = DT->getNode(I->getParent());
12654 assert(NodeA && "Should only process reachable instructions");
12655 assert(NodeB && "Should only process reachable instructions");
12656 assert((NodeA == NodeB) ==
12657 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12658 "Different nodes should have different DFS numbers");
12659 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
12660 LastInst = I;
12661 }
12662 BB = LastInst->getParent();
12663 return LastInst;
12664 };
12665
12666 auto FindFirstInst = [&]() {
12667 Instruction *FirstInst = Front;
12668 for (Value *V : E->Scalars) {
12669 auto *I = dyn_cast<Instruction>(V);
12670 if (!I)
12671 continue;
12672 if (FirstInst->getParent() == I->getParent()) {
12673 if (I->comesBefore(FirstInst))
12674 FirstInst = I;
12675 continue;
12676 }
12677 assert(((E->getOpcode() == Instruction::GetElementPtr &&
12679 (isVectorLikeInstWithConstOps(FirstInst) &&
12681 "Expected vector-like or non-GEP in GEP node insts only.");
12682 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
12683 FirstInst = I;
12684 continue;
12685 }
12686 if (!DT->isReachableFromEntry(I->getParent()))
12687 continue;
12688 auto *NodeA = DT->getNode(FirstInst->getParent());
12689 auto *NodeB = DT->getNode(I->getParent());
12690 assert(NodeA && "Should only process reachable instructions");
12691 assert(NodeB && "Should only process reachable instructions");
12692 assert((NodeA == NodeB) ==
12693 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12694 "Different nodes should have different DFS numbers");
12695 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
12696 FirstInst = I;
12697 }
12698 return FirstInst;
12699 };
12700
12701 // Set insertpoint for gathered loads to the very first load.
12702 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
12703 E->Idx >= GatheredLoadsEntriesFirst && !E->isGather() &&
12704 E->getOpcode() == Instruction::Load) {
12705 Res = FindFirstInst();
12706 return *Res;
12707 }
12708
12709 // Set the insert point to the beginning of the basic block if the entry
12710 // should not be scheduled.
12711 if (doesNotNeedToSchedule(E->Scalars) ||
12712 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
12713 if ((E->getOpcode() == Instruction::GetElementPtr &&
12714 any_of(E->Scalars,
12715 [](Value *V) {
12716 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
12717 })) ||
12718 all_of(E->Scalars,
12719 [](Value *V) {
12720 return !isVectorLikeInstWithConstOps(V) &&
12721 isUsedOutsideBlock(V);
12722 }) ||
12723 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
12724 return isa<ExtractElementInst, UndefValue>(V) ||
12725 areAllOperandsNonInsts(V);
12726 })))
12727 Res = FindLastInst();
12728 else
12729 Res = FindFirstInst();
12730 return *Res;
12731 }
12732
12733 // Find the last instruction. The common case should be that BB has been
12734 // scheduled, and the last instruction is VL.back(). So we start with
12735 // VL.back() and iterate over schedule data until we reach the end of the
12736 // bundle. The end of the bundle is marked by null ScheduleData.
12737 if (BlocksSchedules.count(BB)) {
12738 Value *V = E->isOneOf(E->Scalars.back());
12740 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
12741 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
12742 if (Bundle && Bundle->isPartOfBundle())
12743 for (; Bundle; Bundle = Bundle->NextInBundle)
12744 Res = Bundle->Inst;
12745 }
12746
12747 // LastInst can still be null at this point if there's either not an entry
12748 // for BB in BlocksSchedules or there's no ScheduleData available for
12749 // VL.back(). This can be the case if buildTree_rec aborts for various
12750 // reasons (e.g., the maximum recursion depth is reached, the maximum region
12751 // size is reached, etc.). ScheduleData is initialized in the scheduling
12752 // "dry-run".
12753 //
12754 // If this happens, we can still find the last instruction by brute force. We
12755 // iterate forwards from Front (inclusive) until we either see all
12756 // instructions in the bundle or reach the end of the block. If Front is the
12757 // last instruction in program order, LastInst will be set to Front, and we
12758 // will visit all the remaining instructions in the block.
12759 //
12760 // One of the reasons we exit early from buildTree_rec is to place an upper
12761 // bound on compile-time. Thus, taking an additional compile-time hit here is
12762 // not ideal. However, this should be exceedingly rare since it requires that
12763 // we both exit early from buildTree_rec and that the bundle be out-of-order
12764 // (causing us to iterate all the way to the end of the block).
12765 if (!Res)
12766 Res = FindLastInst();
12767 assert(Res && "Failed to find last instruction in bundle");
12768 return *Res;
12769}
12770
// Positions the IRBuilder so vectorized code for bundle \p E is emitted after
// the bundle's last scalar instruction (after the PHI block if that
// instruction is a PHI), and stamps the debug location of the bundle's main
// operation on subsequently emitted instructions.
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    // PHI or unscheduled bundle: insert exactly at the computed iterator.
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set the
    // debug location to Front.
    Builder.SetInsertPoint(
        LastInst->getParent(),
  }
  // All code emitted for this bundle inherits Front's debug location.
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
12791
// Materializes the scalars of \p VL as a vector of type
// <VL.size() x ScalarTy> by emitting an insertelement chain, optionally on
// top of \p Root. Lanes whose scalars live inside the current loop (or are
// already vectorized) are inserted last so loop-invariant inserts can still
// be hoisted.
Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  // Returns true if InsertBB is reachable from InstBB by walking a chain of
  // unique single predecessors (Visited guards against CFG cycles).
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  // Collect the lanes that must be emitted last: scalars dominating the
  // insert point via single-predecessor chains, already-vectorized scalars,
  // and loop-resident scalars (when Root is loop-invariant or absent).
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
        PostponedInsts.emplace_back(Inst, I);
  }

  // Inserts scalar V into Vec at lane Pos, int-casting to Ty when the types
  // differ, and records the emitted insert for CSE and external-use tracking.
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      // Look through a cast whose source is neither deleted nor vectorized,
      // to cast directly from the original operand.
      if (auto *CI = dyn_cast<CastInst>(Scalar);
        Value *Op = CI->getOperand(0);
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      // REVEC mode: the "scalar" is itself a small vector; use
      // llvm.vector.insert at the scaled position.
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec = InsElt = Builder.CreateInsertVector(
          Vec->getType(), Vec, Scalar,
          Builder.getInt64(Pos * VecTy->getNumElements()));
      auto *II = dyn_cast<IntrinsicInst>(InsElt);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
        return Vec;
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
      if (!InsElt)
        // The insert folded to a constant/existing value; nothing to record.
        return Vec;
    }
    GatherShuffleExtractSeq.insert(InsElt);
    CSEBlocks.insert(InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          // A cast was emitted; the cast instruction is the user of V.
          if (auto *SI = dyn_cast<Instruction>(Scalar))
            UserOp = SI;
        } else {
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  Value *Vec = Root ? Root : PoisonValue::get(VecTy);
  SmallVector<int> NonConsts;
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(I))
      continue;
    if (!isConstant(VL[I])) {
      NonConsts.push_back(I);
      continue;
    }
    if (Root) {
      // On top of an existing Root only undef lanes may be overwritten;
      // other constants are treated like non-constants.
      if (!isa<UndefValue>(VL[I])) {
        NonConsts.push_back(I);
        continue;
      }
      if (isa<PoisonValue>(VL[I]))
        continue;
      if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
        // Lane already poison in the Root shuffle; no insert needed.
        if (SV->getMaskValue(I) == PoisonMaskElem)
          continue;
      }
    }
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
12904
12905/// Merges shuffle masks and emits final shuffle instruction, if required. It
12906/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
12907/// when the actual shuffle instruction is generated only if this is actually
12908/// required. Otherwise, the shuffle instruction emission is delayed till the
12909/// end of the process, to reduce the number of emitted instructions and further
12910/// analysis/transformations.
12911/// The class also will look through the previously emitted shuffle instructions
12912/// and properly mark indices in mask as undef.
12913/// For example, given the code
12914/// \code
12915/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12916/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12917/// \endcode
12918/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12919/// look through %s1 and %s2 and emit
12920/// \code
12921/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
12922/// \endcode
12923/// instead.
12924/// If 2 operands are of different size, the smallest one will be resized and
12925/// the mask recalculated properly.
12926/// For example, given the code
12927/// \code
12928/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12929/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12930/// \endcode
12931/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12932/// look through %s1 and %s2 and emit
12933/// \code
12934/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
12935/// \endcode
12936/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  /// Set once finalize() has run; the destructor asserts that either
  /// finalization happened or no mask was ever accumulated.
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It hold at max 2
  /// operands, if the 3rd is going to be added, the first 2 are combined into
  /// shuffle with \p CommonMask mask, the first operand sets to be the
  /// resulting shuffle and the second operand sets to be the newly added
  /// operand. The \p CommonMask is transformed in the proper way after that.
  SmallVector<Value *, 2> InVectors;
  /// IR builder used for all emitted shuffles, casts and inserts.
  IRBuilderBase &Builder;
  /// Owning vectorizer; supplies tree entries, gather() and CSE bookkeeping.
  BoUpSLP &R;

  /// IR-emitting callback object handed to BaseShuffleAnalysis::createShuffle;
  /// records every emitted instruction for later CSE.
  class ShuffleIRBuilder {
    IRBuilderBase &Builder;
    /// Holds all of the instructions that we gathered.
    SetVector<Instruction *> &GatherShuffleExtractSeq;
    /// A list of blocks that we are going to CSE.
    DenseSet<BasicBlock *> &CSEBlocks;
    /// Data layout.
    const DataLayout &DL;

  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    /// Creates shufflevector for the 2 operands with the given mask.
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
            V1->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          // Widen the operand with the narrower integer elements to the other
          // operand's type; signed cast unless known non-negative.
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            V2 = Builder.CreateIntCast(
                V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
          else
            V1 = Builder.CreateIntCast(
                V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
        }
      }
      Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        // Remember the shuffle so it can participate in later CSE.
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    /// Creates permutation of the single vector operand with the given mask, if
    /// it is not identity mask.
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      if (Mask.empty())
        return V1;
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      // Identity permutation of the same width needs no instruction.
      if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
        return V1;
      Value *Vec = Builder.CreateShuffleVector(V1, Mask);
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      return Vec;
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      return PoisonValue::get(getWidenedType(Ty, VF));
    }
    /// Resizes 2 input vector to match the sizes, if the they are not equal
    /// yet. The smallest vector is resized to the size of the larger vector.
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      // Pad the smaller vector with poison lanes up to the larger VF.
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      Op = Builder.CreateShuffleVector(Op, IdentityMask);
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      if (MinVF == V1VF)
        V1 = Op;
      else
        V2 = Op;
    }
  };

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    // After a shuffle was emitted, live lanes select from the shuffle result
    // itself, i.e. become identity indices.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }

  /// Cast value \p V to the vector type with the same number of elements, but
  /// the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
        IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
  }

public:
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}

  /// Adjusts extractelements after reusing them.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    // Resolve the (possibly vectorized) base vector of every live
    // extractelement lane and erase extracts whose only use is vectorized.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(TE->Scalars, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      // Single source vector: just cast it to the target element type.
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single virtual
    // long vector.
    // Need to shuffle each part independently and then insert all this parts
    // into a long virtual vector register, forming the original vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
          ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
      MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(VL, SubMask);
      // VF of this part = widest base vector referenced by a live lane.
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize, P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(SubVec->getType())->getNumElements();
          NewVF = std::max(NewVF, SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(VecMask, Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
    // No need to delay emission if all deps are ready.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        ResVecTy,
        MaybeAlign());
  }
  /// Adds 2 input vectors (in form of tree entries) and the mask for their
  /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    // Signed cast if any scalar of the entry may be negative.
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, V2, Mask);
  }
  /// Adds single input vector (in form of tree entry) and the mask for its
  /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
             return !isKnownNonNegative(
                 V, SimplifyQuery(*R.DL));
           }));
    add(V1, Mask);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      InVectors.push_back(V2);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    // Fold the already-accumulated operands into one vector first, so at most
    // two operands remain pending.
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    // Lanes coming from the new pair select from the second pending operand.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + Sz;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    if (InVectors.empty()) {
      InVectors.push_back(V1);
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    const auto *It = find(InVectors, V1);
    if (It == InVectors.end()) {
      // V1 is a new operand: make room by folding the pending operands when
      // both slots are taken or the types disagree.
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(InVectors.front(), nullptr, CommonMask);
          transformMaskAfterShuffle(CommonMask, CommonMask);
        }
        unsigned VF = std::max(CommonMask.size(), Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] =
                V->getType() != V1->getType()
                    ? Idx + VF
                    : Mask[Idx] + cast<FixedVectorType>(V1->getType())
                                      ->getNumElements();
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(V1);
        return;
      }
      // Check if second vector is required if the used elements are already
      // used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(V1);
          break;
        }
    }
    int VF = getVF(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
  /// Adds another one input vector and the mask for the shuffling.
    SmallVector<int> NewMask;
    inversePermutation(Order, NewMask);
    add(V1, NewMask);
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy);
  }
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before final applying of
  /// the \p ExtMask mask.
  Value *
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    SmallVector<int> NewExtMask(ExtMask);
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      // REVEC mode: scale scalar-granularity masks to vector granularity.
      assert(SLPReVec && "FixedVectorType is not expected.");
          CommonMask);
          NewExtMask);
      ExtMask = NewExtMask;
    }
    if (Action) {
      // Flush pending operands into one vector of length VF, then let the
      // caller's Action post-process it before the extension mask is applied.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      // Insert each delayed subvector entry at its recorded offset.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      for (auto [E, Idx] : SubVectors) {
        Value *V = castToScalarTyElem(E->VectorizedValue);
        Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
                                         Builder.getInt64(Idx));
        if (!CommonMask.empty()) {
          // The inserted lanes now pass through identically.
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        // Compose ExtMask on top of CommonMask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
13438
13439BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
13440 unsigned NodeIdx) {
13441 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
13442 InstructionsState S = getSameOpcode(VL, *TLI);
13443 // Special processing for GEPs bundle, which may include non-gep values.
13444 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
13445 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
13446 if (It != VL.end())
13447 S = getSameOpcode(*It, *TLI);
13448 }
13449 if (!S.getOpcode())
13450 return nullptr;
13451 auto CheckSameVE = [&](const TreeEntry *VE) {
13452 return VE->isSame(VL) &&
13453 (any_of(VE->UserTreeIndices,
13454 [E, NodeIdx](const EdgeInfo &EI) {
13455 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
13456 }) ||
13457 any_of(VectorizableTree,
13458 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
13459 return TE->isOperandGatherNode(
13460 {const_cast<TreeEntry *>(E), NodeIdx}) &&
13461 VE->isSame(TE->Scalars);
13462 }));
13463 };
13464 TreeEntry *VE = getTreeEntry(S.OpValue);
13465 if (VE && CheckSameVE(VE))
13466 return VE;
13467 auto It = MultiNodeScalars.find(S.OpValue);
13468 if (It != MultiNodeScalars.end()) {
13469 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
13470 return TE != VE && CheckSameVE(TE);
13471 });
13472 if (I != It->getSecond().end())
13473 return *I;
13474 }
13475 return nullptr;
13476}
13477
13478Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
13479 bool PostponedPHIs) {
13480 ValueList &VL = E->getOperand(NodeIdx);
13481 const unsigned VF = VL.size();
13482 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
13483 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
13484 // V may be affected by MinBWs.
13485 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
13486 // factor is the number of elements, not their type.
13487 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
13488 unsigned NumElements = getNumElements(VL.front()->getType());
13489 ShuffleInstructionBuilder ShuffleBuilder(
13490 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
13491 : ScalarTy,
13492 Builder, *this);
13493 ShuffleBuilder.add(V, Mask);
13495 E->CombinedEntriesWithIndices.size());
13496 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
13497 [&](const auto &P) {
13498 return std::make_pair(VectorizableTree[P.first].get(),
13499 P.second);
13500 });
13501 return ShuffleBuilder.finalize({}, SubVectors);
13502 };
13503 Value *V = vectorizeTree(VE, PostponedPHIs);
13504 if (VF * getNumElements(VL[0]->getType()) !=
13505 cast<FixedVectorType>(V->getType())->getNumElements()) {
13506 if (!VE->ReuseShuffleIndices.empty()) {
13507 // Reshuffle to get only unique values.
13508 // If some of the scalars are duplicated in the vectorization
13509 // tree entry, we do not vectorize them but instead generate a
13510 // mask for the reuses. But if there are several users of the
13511 // same entry, they may have different vectorization factors.
13512 // This is especially important for PHI nodes. In this case, we
13513 // need to adapt the resulting instruction for the user
13514 // vectorization factor and have to reshuffle it again to take
13515 // only unique elements of the vector. Without this code the
13516 // function incorrectly returns reduced vector instruction with
13517 // the same elements, not with the unique ones.
13518
13519 // block:
13520 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
13521 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
13522 // ... (use %2)
13523 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
13524 // br %block
13526 for (auto [I, V] : enumerate(VL)) {
13527 if (isa<PoisonValue>(V))
13528 continue;
13529 Mask[I] = VE->findLaneForValue(V);
13530 }
13531 V = FinalShuffle(V, Mask);
13532 } else {
13533 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
13534 "Expected vectorization factor less "
13535 "than original vector size.");
13536 SmallVector<int> UniformMask(VF, 0);
13537 std::iota(UniformMask.begin(), UniformMask.end(), 0);
13538 V = FinalShuffle(V, UniformMask);
13539 }
13540 }
13541 // Need to update the operand gather node, if actually the operand is not a
13542 // vectorized node, but the buildvector/gather node, which matches one of
13543 // the vectorized nodes.
13544 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
13545 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
13546 }) == VE->UserTreeIndices.end()) {
13547 auto *It =
13548 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
13549 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
13550 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
13551 });
13552 assert(It != VectorizableTree.end() && "Expected gather node operand.");
13553 (*It)->VectorizedValue = V;
13554 }
13555 return V;
13556 }
13557
13558 // Find the corresponding gather entry and vectorize it.
13559 // Allows to be more accurate with tree/graph transformations, checks for the
13560 // correctness of the transformations in many cases.
13561 auto *I = find_if(VectorizableTree,
13562 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
13563 return TE->isOperandGatherNode({E, NodeIdx});
13564 });
13565 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
13566 assert(I->get()->UserTreeIndices.size() == 1 &&
13567 "Expected only single user for the gather node.");
13568 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
13569 return vectorizeTree(I->get(), PostponedPHIs);
13570}
13571
13572template <typename BVTy, typename ResTy, typename... Args>
13573ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
13574 Args &...Params) {
13575 assert(E->isGather() && "Expected gather node.");
13576 unsigned VF = E->getVectorFactor();
13577
13578 bool NeedFreeze = false;
13579 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
13580 E->ReuseShuffleIndices.end());
13581 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
13582 // Clear values, to be replaced by insertvector instructions.
13583 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
13584 for_each(MutableArrayRef(GatheredScalars)
13585 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
13586 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
13588 E->CombinedEntriesWithIndices.size());
13589 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
13590 [&](const auto &P) {
13591 return std::make_pair(VectorizableTree[P.first].get(), P.second);
13592 });
13593 // Build a mask out of the reorder indices and reorder scalars per this
13594 // mask.
13595 SmallVector<int> ReorderMask;
13596 inversePermutation(E->ReorderIndices, ReorderMask);
13597 if (!ReorderMask.empty())
13598 reorderScalars(GatheredScalars, ReorderMask);
13599 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
13600 unsigned I, unsigned SliceSize) {
13601 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
13602 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
13603 }))
13604 return false;
13605 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
13606 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
13607 if (UserTE->getNumOperands() != 2)
13608 return false;
13609 auto *It =
13610 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
13611 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
13612 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
13613 }) != TE->UserTreeIndices.end();
13614 });
13615 if (It == VectorizableTree.end())
13616 return false;
13617 int Idx;
13618 if ((Mask.size() < InputVF &&
13619 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
13620 Idx == 0) ||
13621 (Mask.size() == InputVF &&
13622 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
13623 std::iota(
13624 std::next(Mask.begin(), I * SliceSize),
13625 std::next(Mask.begin(),
13626 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
13627 0);
13628 } else {
13629 unsigned IVal =
13630 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
13631 std::fill(
13632 std::next(Mask.begin(), I * SliceSize),
13633 std::next(Mask.begin(),
13634 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
13635 IVal);
13636 }
13637 return true;
13638 };
13639 BVTy ShuffleBuilder(ScalarTy, Params...);
13640 ResTy Res = ResTy();
13642 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
13644 Value *ExtractVecBase = nullptr;
13645 bool UseVecBaseAsInput = false;
13648 Type *OrigScalarTy = GatheredScalars.front()->getType();
13649 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
13650 unsigned NumParts = TTI->getNumberOfParts(VecTy);
13651 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
13652 VecTy->getNumElements() % NumParts != 0 ||
13654 VecTy->getNumElements() / NumParts))
13655 NumParts = 1;
13656 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
13657 // Check for gathered extracts.
13658 bool Resized = false;
13659 ExtractShuffles =
13660 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
13661 if (!ExtractShuffles.empty()) {
13662 SmallVector<const TreeEntry *> ExtractEntries;
13663 for (auto [Idx, I] : enumerate(ExtractMask)) {
13664 if (I == PoisonMaskElem)
13665 continue;
13666 if (const auto *TE = getTreeEntry(
13667 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
13668 ExtractEntries.push_back(TE);
13669 }
13670 if (std::optional<ResTy> Delayed =
13671 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
13672 // Delay emission of gathers which are not ready yet.
13673 PostponedGathers.insert(E);
13674 // Postpone gather emission, will be emitted after the end of the
13675 // process to keep correct order.
13676 return *Delayed;
13677 }
13678 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
13679 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
13680 ExtractVecBase = VecBase;
13681 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
13682 if (VF == VecBaseTy->getNumElements() &&
13683 GatheredScalars.size() != VF) {
13684 Resized = true;
13685 GatheredScalars.append(VF - GatheredScalars.size(),
13686 PoisonValue::get(OrigScalarTy));
13687 }
13688 }
13689 }
13690 // Gather extracts after we check for full matched gathers only.
13691 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
13692 ((E->getOpcode() == Instruction::Load ||
13693 any_of(E->Scalars, IsaPred<LoadInst>)) &&
13694 any_of(E->Scalars,
13695 [this](Value *V) {
13696 return isa<LoadInst>(V) && getTreeEntry(V);
13697 })) ||
13698 E->isAltShuffle() ||
13699 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
13700 isSplat(E->Scalars) ||
13701 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
13702 GatherShuffles =
13703 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
13704 }
13705 if (!GatherShuffles.empty()) {
13706 if (std::optional<ResTy> Delayed =
13707 ShuffleBuilder.needToDelay(E, Entries)) {
13708 // Delay emission of gathers which are not ready yet.
13709 PostponedGathers.insert(E);
13710 // Postpone gather emission, will be emitted after the end of the
13711 // process to keep correct order.
13712 return *Delayed;
13713 }
13714 if (GatherShuffles.size() == 1 &&
13715 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
13716 Entries.front().front()->isSame(E->Scalars)) {
13717 // Perfect match in the graph, will reuse the previously vectorized
13718 // node. Cost is 0.
13719 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
13720 << shortBundleName(E->Scalars, E->Idx) << ".\n");
13721 // Restore the mask for previous partially matched values.
13722 Mask.resize(E->Scalars.size());
13723 const TreeEntry *FrontTE = Entries.front().front();
13724 if (FrontTE->ReorderIndices.empty() &&
13725 ((FrontTE->ReuseShuffleIndices.empty() &&
13726 E->Scalars.size() == FrontTE->Scalars.size()) ||
13727 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
13728 std::iota(Mask.begin(), Mask.end(), 0);
13729 } else {
13730 for (auto [I, V] : enumerate(E->Scalars)) {
13731 if (isa<PoisonValue>(V)) {
13733 continue;
13734 }
13735 Mask[I] = FrontTE->findLaneForValue(V);
13736 }
13737 }
13738 ShuffleBuilder.add(*FrontTE, Mask);
13739 Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
13740 return Res;
13741 }
13742 if (!Resized) {
13743 if (GatheredScalars.size() != VF &&
13744 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
13745 return any_of(TEs, [&](const TreeEntry *TE) {
13746 return TE->getVectorFactor() == VF;
13747 });
13748 }))
13749 GatheredScalars.append(VF - GatheredScalars.size(),
13750 PoisonValue::get(OrigScalarTy));
13751 }
13752 // Remove shuffled elements from list of gathers.
13753 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
13754 if (Mask[I] != PoisonMaskElem)
13755 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
13756 }
13757 }
13758 }
13759 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
13760 SmallVectorImpl<int> &ReuseMask,
13761 bool IsRootPoison) {
13762 // For splats with can emit broadcasts instead of gathers, so try to find
13763 // such sequences.
13764 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
13765 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
13766 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
13767 SmallVector<int> UndefPos;
13768 DenseMap<Value *, unsigned> UniquePositions;
13769 // Gather unique non-const values and all constant values.
13770 // For repeated values, just shuffle them.
13771 int NumNonConsts = 0;
13772 int SinglePos = 0;
13773 for (auto [I, V] : enumerate(Scalars)) {
13774 if (isa<UndefValue>(V)) {
13775 if (!isa<PoisonValue>(V)) {
13776 ReuseMask[I] = I;
13777 UndefPos.push_back(I);
13778 }
13779 continue;
13780 }
13781 if (isConstant(V)) {
13782 ReuseMask[I] = I;
13783 continue;
13784 }
13785 ++NumNonConsts;
13786 SinglePos = I;
13787 Value *OrigV = V;
13788 Scalars[I] = PoisonValue::get(OrigScalarTy);
13789 if (IsSplat) {
13790 Scalars.front() = OrigV;
13791 ReuseMask[I] = 0;
13792 } else {
13793 const auto Res = UniquePositions.try_emplace(OrigV, I);
13794 Scalars[Res.first->second] = OrigV;
13795 ReuseMask[I] = Res.first->second;
13796 }
13797 }
13798 if (NumNonConsts == 1) {
13799 // Restore single insert element.
13800 if (IsSplat) {
13801 ReuseMask.assign(VF, PoisonMaskElem);
13802 std::swap(Scalars.front(), Scalars[SinglePos]);
13803 if (!UndefPos.empty() && UndefPos.front() == 0)
13804 Scalars.front() = UndefValue::get(OrigScalarTy);
13805 }
13806 ReuseMask[SinglePos] = SinglePos;
13807 } else if (!UndefPos.empty() && IsSplat) {
13808 // For undef values, try to replace them with the simple broadcast.
13809 // We can do it if the broadcasted value is guaranteed to be
13810 // non-poisonous, or by freezing the incoming scalar value first.
13811 auto *It = find_if(Scalars, [this, E](Value *V) {
13812 return !isa<UndefValue>(V) &&
13813 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
13814 (E->UserTreeIndices.size() == 1 &&
13815 any_of(V->uses(), [E](const Use &U) {
13816 // Check if the value already used in the same operation in
13817 // one of the nodes already.
13818 return E->UserTreeIndices.front().EdgeIdx !=
13819 U.getOperandNo() &&
13820 is_contained(
13821 E->UserTreeIndices.front().UserTE->Scalars,
13822 U.getUser());
13823 })));
13824 });
13825 if (It != Scalars.end()) {
13826 // Replace undefs by the non-poisoned scalars and emit broadcast.
13827 int Pos = std::distance(Scalars.begin(), It);
13828 for (int I : UndefPos) {
13829 // Set the undef position to the non-poisoned scalar.
13830 ReuseMask[I] = Pos;
13831 // Replace the undef by the poison, in the mask it is replaced by
13832 // non-poisoned scalar already.
13833 if (I != Pos)
13834 Scalars[I] = PoisonValue::get(OrigScalarTy);
13835 }
13836 } else {
13837 // Replace undefs by the poisons, emit broadcast and then emit
13838 // freeze.
13839 for (int I : UndefPos) {
13840 ReuseMask[I] = PoisonMaskElem;
13841 if (isa<UndefValue>(Scalars[I]))
13842 Scalars[I] = PoisonValue::get(OrigScalarTy);
13843 }
13844 NeedFreeze = true;
13845 }
13846 }
13847 };
13848 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
13849 bool IsNonPoisoned = true;
13850 bool IsUsedInExpr = true;
13851 Value *Vec1 = nullptr;
13852 if (!ExtractShuffles.empty()) {
13853 // Gather of extractelements can be represented as just a shuffle of
13854 // a single/two vectors the scalars are extracted from.
13855 // Find input vectors.
13856 Value *Vec2 = nullptr;
13857 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
13858 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
13859 ExtractMask[I] = PoisonMaskElem;
13860 }
13861 if (UseVecBaseAsInput) {
13862 Vec1 = ExtractVecBase;
13863 } else {
13864 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
13865 if (ExtractMask[I] == PoisonMaskElem)
13866 continue;
13867 if (isa<UndefValue>(E->Scalars[I]))
13868 continue;
13869 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
13870 Value *VecOp = EI->getVectorOperand();
13871 if (const auto *TE = getTreeEntry(VecOp))
13872 if (TE->VectorizedValue)
13873 VecOp = TE->VectorizedValue;
13874 if (!Vec1) {
13875 Vec1 = VecOp;
13876 } else if (Vec1 != VecOp) {
13877 assert((!Vec2 || Vec2 == VecOp) &&
13878 "Expected only 1 or 2 vectors shuffle.");
13879 Vec2 = VecOp;
13880 }
13881 }
13882 }
13883 if (Vec2) {
13884 IsUsedInExpr = false;
13885 IsNonPoisoned &=
13887 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
13888 } else if (Vec1) {
13889 IsUsedInExpr &= FindReusedSplat(
13890 ExtractMask,
13891 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
13892 ExtractMask.size());
13893 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
13894 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
13895 } else {
13896 IsUsedInExpr = false;
13897 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
13898 /*ForExtracts=*/true);
13899 }
13900 }
13901 if (!GatherShuffles.empty()) {
13902 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
13903 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
13904 for (const auto [I, TEs] : enumerate(Entries)) {
13905 if (TEs.empty()) {
13906 assert(!GatherShuffles[I] &&
13907 "No shuffles with empty entries list expected.");
13908 continue;
13909 }
13910 assert((TEs.size() == 1 || TEs.size() == 2) &&
13911 "Expected shuffle of 1 or 2 entries.");
13912 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
13913 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
13914 VecMask.assign(VecMask.size(), PoisonMaskElem);
13915 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
13916 if (TEs.size() == 1) {
13917 IsUsedInExpr &= FindReusedSplat(
13918 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
13919 ShuffleBuilder.add(*TEs.front(), VecMask);
13920 if (TEs.front()->VectorizedValue)
13921 IsNonPoisoned &=
13922 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
13923 } else {
13924 IsUsedInExpr = false;
13925 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
13926 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
13927 IsNonPoisoned &=
13928 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
13929 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
13930 }
13931 }
13932 }
13933 // Try to figure out best way to combine values: build a shuffle and insert
13934 // elements or just build several shuffles.
13935 // Insert non-constant scalars.
13936 SmallVector<Value *> NonConstants(GatheredScalars);
13937 int EMSz = ExtractMask.size();
13938 int MSz = Mask.size();
13939 // Try to build constant vector and shuffle with it only if currently we
13940 // have a single permutation and more than 1 scalar constants.
13941 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
13942 bool IsIdentityShuffle =
13943 ((UseVecBaseAsInput ||
13944 all_of(ExtractShuffles,
13945 [](const std::optional<TTI::ShuffleKind> &SK) {
13946 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
13948 })) &&
13949 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
13950 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
13951 (!GatherShuffles.empty() &&
13952 all_of(GatherShuffles,
13953 [](const std::optional<TTI::ShuffleKind> &SK) {
13954 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
13956 }) &&
13957 none_of(Mask, [&](int I) { return I >= MSz; }) &&
13959 bool EnoughConstsForShuffle =
13960 IsSingleShuffle &&
13961 (none_of(GatheredScalars,
13962 [](Value *V) {
13963 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
13964 }) ||
13965 any_of(GatheredScalars,
13966 [](Value *V) {
13967 return isa<Constant>(V) && !isa<UndefValue>(V);
13968 })) &&
13969 (!IsIdentityShuffle ||
13970 (GatheredScalars.size() == 2 &&
13971 any_of(GatheredScalars,
13972 [](Value *V) { return !isa<UndefValue>(V); })) ||
13973 count_if(GatheredScalars, [](Value *V) {
13974 return isa<Constant>(V) && !isa<PoisonValue>(V);
13975 }) > 1);
13976 // NonConstants array contains just non-constant values, GatheredScalars
13977 // contains only constant to build final vector and then shuffle.
13978 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
13979 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
13980 NonConstants[I] = PoisonValue::get(OrigScalarTy);
13981 else
13982 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
13983 }
13984 // Generate constants for final shuffle and build a mask for them.
13985 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
13986 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
13987 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
13988 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
13989 ShuffleBuilder.add(BV, BVMask);
13990 }
13991 if (all_of(NonConstants, [=](Value *V) {
13992 return isa<PoisonValue>(V) ||
13993 (IsSingleShuffle && ((IsIdentityShuffle &&
13994 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
13995 }))
13996 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
13997 else
13998 Res = ShuffleBuilder.finalize(
13999 E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
14000 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
14001 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
14002 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
14003 });
14004 } else if (!allConstant(GatheredScalars)) {
14005 // Gather unique scalars and all constants.
14006 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
14007 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
14008 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
14009 ShuffleBuilder.add(BV, ReuseMask);
14010 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14011 } else {
14012 // Gather all constants.
14013 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
14014 for (auto [I, V] : enumerate(GatheredScalars)) {
14015 if (!isa<PoisonValue>(V))
14016 Mask[I] = I;
14017 }
14018 Value *BV = ShuffleBuilder.gather(GatheredScalars);
14019 ShuffleBuilder.add(BV, Mask);
14020 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14021 }
14022
14023 if (NeedFreeze)
14024 Res = ShuffleBuilder.createFreeze(Res);
14025 return Res;
14026}
14027
14028Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
14029 bool PostponedPHIs) {
14030 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
14031 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
14032 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
14033 Builder, *this);
14034}
14035
14036Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
14037 IRBuilderBase::InsertPointGuard Guard(Builder);
14038
14039 if (E->VectorizedValue &&
14040 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
14041 E->isAltShuffle())) {
14042 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
14043 return E->VectorizedValue;
14044 }
14045
14046 Value *V = E->Scalars.front();
14047 Type *ScalarTy = V->getType();
14048 if (!isa<CmpInst>(V))
14049 ScalarTy = getValueType(V);
14050 auto It = MinBWs.find(E);
14051 if (It != MinBWs.end()) {
14052 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
14053 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
14054 if (VecTy)
14055 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
14056 }
14057 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
14058 if (E->isGather()) {
14059 // Set insert point for non-reduction initial nodes.
14060 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
14061 setInsertPointAfterBundle(E);
14062 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
14063 E->VectorizedValue = Vec;
14064 return Vec;
14065 }
14066
14067 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
14068 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
14069 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
14070 if (E->getOpcode() == Instruction::Store &&
14071 E->State == TreeEntry::Vectorize) {
14073 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
14074 E->ReorderIndices.size());
14075 ShuffleBuilder.add(V, Mask);
14076 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
14077 ShuffleBuilder.addOrdered(V, {});
14078 } else {
14079 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
14080 }
14082 E->CombinedEntriesWithIndices.size());
14083 transform(
14084 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
14085 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14086 });
14087 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
14088 };
14089
14090 assert(!E->isGather() && "Unhandled state");
14091 unsigned ShuffleOrOp =
14092 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
14093 Instruction *VL0 = E->getMainOp();
14094 auto GetOperandSignedness = [&](unsigned Idx) {
14095 const TreeEntry *OpE = getOperandEntry(E, Idx);
14096 bool IsSigned = false;
14097 auto It = MinBWs.find(OpE);
14098 if (It != MinBWs.end())
14099 IsSigned = It->second.second;
14100 else
14101 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
14102 return !isKnownNonNegative(R, SimplifyQuery(*DL));
14103 });
14104 return IsSigned;
14105 };
14106 switch (ShuffleOrOp) {
14107 case Instruction::PHI: {
14108 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
14109 E != VectorizableTree.front().get() ||
14110 !E->UserTreeIndices.empty()) &&
14111 "PHI reordering is free.");
14112 if (PostponedPHIs && E->VectorizedValue)
14113 return E->VectorizedValue;
14114 auto *PH = cast<PHINode>(VL0);
14115 Builder.SetInsertPoint(PH->getParent(),
14116 PH->getParent()->getFirstNonPHIIt());
14117 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14118 if (PostponedPHIs || !E->VectorizedValue) {
14119 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
14120 E->PHI = NewPhi;
14121 Value *V = NewPhi;
14122
14123 // Adjust insertion point once all PHI's have been generated.
14124 Builder.SetInsertPoint(PH->getParent(),
14125 PH->getParent()->getFirstInsertionPt());
14126 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14127
14128 V = FinalShuffle(V, E);
14129
14130 E->VectorizedValue = V;
14131 if (PostponedPHIs)
14132 return V;
14133 }
14134 PHINode *NewPhi = cast<PHINode>(E->PHI);
14135 // If phi node is fully emitted - exit.
14136 if (NewPhi->getNumIncomingValues() != 0)
14137 return NewPhi;
14138
14139 // PHINodes may have multiple entries from the same block. We want to
14140 // visit every block once.
14142
14143 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
14145 BasicBlock *IBB = PH->getIncomingBlock(I);
14146
14147 // Stop emission if all incoming values are generated.
14148 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
14149 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14150 return NewPhi;
14151 }
14152
14153 if (!VisitedBBs.insert(IBB).second) {
14154 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
14155 continue;
14156 }
14157
14158 Builder.SetInsertPoint(IBB->getTerminator());
14159 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
14160 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
14161 if (VecTy != Vec->getType()) {
14162 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
14163 MinBWs.contains(getOperandEntry(E, I))) &&
14164 "Expected item in MinBWs.");
14165 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
14166 }
14167 NewPhi->addIncoming(Vec, IBB);
14168 }
14169
14170 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
14171 "Invalid number of incoming values");
14172 assert(E->VectorizedValue && "Expected vectorized value.");
14173 return E->VectorizedValue;
14174 }
14175
14176 case Instruction::ExtractElement: {
14177 Value *V = E->getSingleOperand(0);
14178 if (const TreeEntry *TE = getTreeEntry(V))
14179 V = TE->VectorizedValue;
14180 setInsertPointAfterBundle(E);
14181 V = FinalShuffle(V, E);
14182 E->VectorizedValue = V;
14183 return V;
14184 }
14185 case Instruction::ExtractValue: {
14186 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
14187 Builder.SetInsertPoint(LI);
14188 Value *Ptr = LI->getPointerOperand();
14189 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
14190 Value *NewV = propagateMetadata(V, E->Scalars);
14191 NewV = FinalShuffle(NewV, E);
14192 E->VectorizedValue = NewV;
14193 return NewV;
14194 }
14195 case Instruction::InsertElement: {
14196 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
14197 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
14198 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
14199 ArrayRef<Value *> Op = E->getOperand(1);
14200 Type *ScalarTy = Op.front()->getType();
14201 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
14202 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
14203 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
14204 assert(Res.first > 0 && "Expected item in MinBWs.");
14205 V = Builder.CreateIntCast(
14206 V,
14208 ScalarTy,
14209 cast<FixedVectorType>(V->getType())->getNumElements()),
14210 Res.second);
14211 }
14212
14213 // Create InsertVector shuffle if necessary
14214 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
14215 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
14216 }));
14217 const unsigned NumElts =
14218 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
14219 const unsigned NumScalars = E->Scalars.size();
14220
14221 unsigned Offset = *getElementIndex(VL0);
14222 assert(Offset < NumElts && "Failed to find vector index offset");
14223
14224 // Create shuffle to resize vector
14226 if (!E->ReorderIndices.empty()) {
14227 inversePermutation(E->ReorderIndices, Mask);
14228 Mask.append(NumElts - NumScalars, PoisonMaskElem);
14229 } else {
14230 Mask.assign(NumElts, PoisonMaskElem);
14231 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
14232 }
14233 // Create InsertVector shuffle if necessary
14234 bool IsIdentity = true;
14235 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
14236 Mask.swap(PrevMask);
14237 for (unsigned I = 0; I < NumScalars; ++I) {
14238 Value *Scalar = E->Scalars[PrevMask[I]];
14239 unsigned InsertIdx = *getElementIndex(Scalar);
14240 IsIdentity &= InsertIdx - Offset == I;
14241 Mask[InsertIdx - Offset] = I;
14242 }
14243 if (!IsIdentity || NumElts != NumScalars) {
14244 Value *V2 = nullptr;
14245 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
14246 SmallVector<int> InsertMask(Mask);
14247 if (NumElts != NumScalars && Offset == 0) {
14248 // Follow all insert element instructions from the current buildvector
14249 // sequence.
14251 do {
14252 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
14253 if (!InsertIdx)
14254 break;
14255 if (InsertMask[*InsertIdx] == PoisonMaskElem)
14256 InsertMask[*InsertIdx] = *InsertIdx;
14257 if (!Ins->hasOneUse())
14258 break;
14260 Ins->getUniqueUndroppableUser());
14261 } while (Ins);
14262 SmallBitVector UseMask =
14263 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
14264 SmallBitVector IsFirstPoison =
14265 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
14266 SmallBitVector IsFirstUndef =
14267 isUndefVector(FirstInsert->getOperand(0), UseMask);
14268 if (!IsFirstPoison.all()) {
14269 unsigned Idx = 0;
14270 for (unsigned I = 0; I < NumElts; I++) {
14271 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
14272 IsFirstUndef.test(I)) {
14273 if (IsVNonPoisonous) {
14274 InsertMask[I] = I < NumScalars ? I : 0;
14275 continue;
14276 }
14277 if (!V2)
14278 V2 = UndefValue::get(V->getType());
14279 if (Idx >= NumScalars)
14280 Idx = NumScalars - 1;
14281 InsertMask[I] = NumScalars + Idx;
14282 ++Idx;
14283 } else if (InsertMask[I] != PoisonMaskElem &&
14284 Mask[I] == PoisonMaskElem) {
14285 InsertMask[I] = PoisonMaskElem;
14286 }
14287 }
14288 } else {
14289 InsertMask = Mask;
14290 }
14291 }
14292 if (!V2)
14293 V2 = PoisonValue::get(V->getType());
14294 V = Builder.CreateShuffleVector(V, V2, InsertMask);
14295 if (auto *I = dyn_cast<Instruction>(V)) {
14296 GatherShuffleExtractSeq.insert(I);
14297 CSEBlocks.insert(I->getParent());
14298 }
14299 }
14300
14301 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
14302 for (unsigned I = 0; I < NumElts; I++) {
14303 if (Mask[I] != PoisonMaskElem)
14304 InsertMask[Offset + I] = I;
14305 }
14306 SmallBitVector UseMask =
14307 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
14308 SmallBitVector IsFirstUndef =
14309 isUndefVector(FirstInsert->getOperand(0), UseMask);
14310 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
14311 NumElts != NumScalars) {
14312 if (IsFirstUndef.all()) {
14313 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
14314 SmallBitVector IsFirstPoison =
14315 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
14316 if (!IsFirstPoison.all()) {
14317 for (unsigned I = 0; I < NumElts; I++) {
14318 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
14319 InsertMask[I] = I + NumElts;
14320 }
14321 }
14322 V = Builder.CreateShuffleVector(
14323 V,
14324 IsFirstPoison.all() ? PoisonValue::get(V->getType())
14325 : FirstInsert->getOperand(0),
14326 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
14327 if (auto *I = dyn_cast<Instruction>(V)) {
14328 GatherShuffleExtractSeq.insert(I);
14329 CSEBlocks.insert(I->getParent());
14330 }
14331 }
14332 } else {
14333 SmallBitVector IsFirstPoison =
14334 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
14335 for (unsigned I = 0; I < NumElts; I++) {
14336 if (InsertMask[I] == PoisonMaskElem)
14337 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
14338 else
14339 InsertMask[I] += NumElts;
14340 }
14341 V = Builder.CreateShuffleVector(
14342 FirstInsert->getOperand(0), V, InsertMask,
14343 cast<Instruction>(E->Scalars.back())->getName());
14344 if (auto *I = dyn_cast<Instruction>(V)) {
14345 GatherShuffleExtractSeq.insert(I);
14346 CSEBlocks.insert(I->getParent());
14347 }
14348 }
14349 }
14350
14351 ++NumVectorInstructions;
14352 E->VectorizedValue = V;
14353 return V;
14354 }
14355 case Instruction::ZExt:
14356 case Instruction::SExt:
14357 case Instruction::FPToUI:
14358 case Instruction::FPToSI:
14359 case Instruction::FPExt:
14360 case Instruction::PtrToInt:
14361 case Instruction::IntToPtr:
14362 case Instruction::SIToFP:
14363 case Instruction::UIToFP:
14364 case Instruction::Trunc:
14365 case Instruction::FPTrunc:
14366 case Instruction::BitCast: {
14367 setInsertPointAfterBundle(E);
14368
14369 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
14370 if (E->VectorizedValue) {
14371 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14372 return E->VectorizedValue;
14373 }
14374
14375 auto *CI = cast<CastInst>(VL0);
14376 Instruction::CastOps VecOpcode = CI->getOpcode();
14377 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
14378 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
14379 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
14380 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
14381 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
14382 // Check if the values are candidates to demote.
14383 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
14384 if (SrcIt != MinBWs.end())
14385 SrcBWSz = SrcIt->second.first;
14386 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
14387 if (BWSz == SrcBWSz) {
14388 VecOpcode = Instruction::BitCast;
14389 } else if (BWSz < SrcBWSz) {
14390 VecOpcode = Instruction::Trunc;
14391 } else if (It != MinBWs.end()) {
14392 assert(BWSz > SrcBWSz && "Invalid cast!");
14393 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
14394 } else if (SrcIt != MinBWs.end()) {
14395 assert(BWSz > SrcBWSz && "Invalid cast!");
14396 VecOpcode =
14397 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
14398 }
14399 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
14400 !SrcIt->second.second) {
14401 VecOpcode = Instruction::UIToFP;
14402 }
14403 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
14404 ? InVec
14405 : Builder.CreateCast(VecOpcode, InVec, VecTy);
14406 V = FinalShuffle(V, E);
14407
14408 E->VectorizedValue = V;
14409 ++NumVectorInstructions;
14410 return V;
14411 }
14412 case Instruction::FCmp:
14413 case Instruction::ICmp: {
14414 setInsertPointAfterBundle(E);
14415
14416 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
14417 if (E->VectorizedValue) {
14418 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14419 return E->VectorizedValue;
14420 }
14421 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
14422 if (E->VectorizedValue) {
14423 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14424 return E->VectorizedValue;
14425 }
14426 if (L->getType() != R->getType()) {
14427 assert((getOperandEntry(E, 0)->isGather() ||
14428 getOperandEntry(E, 1)->isGather() ||
14429 MinBWs.contains(getOperandEntry(E, 0)) ||
14430 MinBWs.contains(getOperandEntry(E, 1))) &&
14431 "Expected item in MinBWs.");
14432 if (cast<VectorType>(L->getType())
14433 ->getElementType()
14434 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
14435 ->getElementType()
14436 ->getIntegerBitWidth()) {
14437 Type *CastTy = R->getType();
14438 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
14439 } else {
14440 Type *CastTy = L->getType();
14441 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
14442 }
14443 }
14444
14445 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
14446 Value *V = Builder.CreateCmp(P0, L, R);
14447 propagateIRFlags(V, E->Scalars, VL0);
14448 // Do not cast for cmps.
14449 VecTy = cast<FixedVectorType>(V->getType());
14450 V = FinalShuffle(V, E);
14451
14452 E->VectorizedValue = V;
14453 ++NumVectorInstructions;
14454 return V;
14455 }
14456 case Instruction::Select: {
14457 setInsertPointAfterBundle(E);
14458
14459 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
14460 if (E->VectorizedValue) {
14461 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14462 return E->VectorizedValue;
14463 }
14464 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
14465 if (E->VectorizedValue) {
14466 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14467 return E->VectorizedValue;
14468 }
14469 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
14470 if (E->VectorizedValue) {
14471 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14472 return E->VectorizedValue;
14473 }
14474 if (True->getType() != VecTy || False->getType() != VecTy) {
14475 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
14476 getOperandEntry(E, 2)->isGather() ||
14477 MinBWs.contains(getOperandEntry(E, 1)) ||
14478 MinBWs.contains(getOperandEntry(E, 2))) &&
14479 "Expected item in MinBWs.");
14480 if (True->getType() != VecTy)
14481 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
14482 if (False->getType() != VecTy)
14483 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
14484 }
14485
14486 unsigned CondNumElements = getNumElements(Cond->getType());
14487 unsigned TrueNumElements = getNumElements(True->getType());
14488 assert(TrueNumElements >= CondNumElements &&
14489 TrueNumElements % CondNumElements == 0 &&
14490 "Cannot vectorize Instruction::Select");
14491 assert(TrueNumElements == getNumElements(False->getType()) &&
14492 "Cannot vectorize Instruction::Select");
14493 if (CondNumElements != TrueNumElements) {
14494 // When the return type is i1 but the source is fixed vector type, we
14495 // need to duplicate the condition value.
14496 Cond = Builder.CreateShuffleVector(
14497 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
14498 CondNumElements));
14499 }
14500 assert(getNumElements(Cond->getType()) == TrueNumElements &&
14501 "Cannot vectorize Instruction::Select");
14502 Value *V = Builder.CreateSelect(Cond, True, False);
14503 V = FinalShuffle(V, E);
14504
14505 E->VectorizedValue = V;
14506 ++NumVectorInstructions;
14507 return V;
14508 }
14509 case Instruction::FNeg: {
14510 setInsertPointAfterBundle(E);
14511
14512 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
14513
14514 if (E->VectorizedValue) {
14515 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14516 return E->VectorizedValue;
14517 }
14518
14519 Value *V = Builder.CreateUnOp(
14520 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
14521 propagateIRFlags(V, E->Scalars, VL0);
14522 if (auto *I = dyn_cast<Instruction>(V))
14523 V = propagateMetadata(I, E->Scalars);
14524
14525 V = FinalShuffle(V, E);
14526
14527 E->VectorizedValue = V;
14528 ++NumVectorInstructions;
14529
14530 return V;
14531 }
14532 case Instruction::Freeze: {
14533 setInsertPointAfterBundle(E);
14534
14535 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
14536
14537 if (E->VectorizedValue) {
14538 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14539 return E->VectorizedValue;
14540 }
14541
14542 Value *V = Builder.CreateFreeze(Op);
14543 V = FinalShuffle(V, E);
14544
14545 E->VectorizedValue = V;
14546 ++NumVectorInstructions;
14547
14548 return V;
14549 }
14550 case Instruction::Add:
14551 case Instruction::FAdd:
14552 case Instruction::Sub:
14553 case Instruction::FSub:
14554 case Instruction::Mul:
14555 case Instruction::FMul:
14556 case Instruction::UDiv:
14557 case Instruction::SDiv:
14558 case Instruction::FDiv:
14559 case Instruction::URem:
14560 case Instruction::SRem:
14561 case Instruction::FRem:
14562 case Instruction::Shl:
14563 case Instruction::LShr:
14564 case Instruction::AShr:
14565 case Instruction::And:
14566 case Instruction::Or:
14567 case Instruction::Xor: {
14568 setInsertPointAfterBundle(E);
14569
14570 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
14571 if (E->VectorizedValue) {
14572 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14573 return E->VectorizedValue;
14574 }
14575 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
14576 if (E->VectorizedValue) {
14577 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14578 return E->VectorizedValue;
14579 }
14580 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
14581 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
14582 ArrayRef<Value *> Ops = E->getOperand(I);
14583 if (all_of(Ops, [&](Value *Op) {
14584 auto *CI = dyn_cast<ConstantInt>(Op);
14585 return CI && CI->getValue().countr_one() >= It->second.first;
14586 })) {
14587 V = FinalShuffle(I == 0 ? RHS : LHS, E);
14588 E->VectorizedValue = V;
14589 ++NumVectorInstructions;
14590 return V;
14591 }
14592 }
14593 }
14594 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
14595 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
14596 getOperandEntry(E, 1)->isGather() ||
14597 MinBWs.contains(getOperandEntry(E, 0)) ||
14598 MinBWs.contains(getOperandEntry(E, 1))) &&
14599 "Expected item in MinBWs.");
14600 if (LHS->getType() != VecTy)
14601 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
14602 if (RHS->getType() != VecTy)
14603 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
14604 }
14605
14606 Value *V = Builder.CreateBinOp(
14607 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
14608 RHS);
14609 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
14610 if (auto *I = dyn_cast<Instruction>(V)) {
14611 V = propagateMetadata(I, E->Scalars);
14612 // Drop nuw flags for abs(sub(commutative), true).
14613 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
14614 any_of(E->Scalars, [](Value *V) {
14615 return isCommutative(cast<Instruction>(V));
14616 }))
14617 I->setHasNoUnsignedWrap(/*b=*/false);
14618 }
14619
14620 V = FinalShuffle(V, E);
14621
14622 E->VectorizedValue = V;
14623 ++NumVectorInstructions;
14624
14625 return V;
14626 }
14627 case Instruction::Load: {
14628 // Loads are inserted at the head of the tree because we don't want to
14629 // sink them all the way down past store instructions.
14630 setInsertPointAfterBundle(E);
14631
14632 LoadInst *LI = cast<LoadInst>(VL0);
14633 Instruction *NewLI;
14634 Value *PO = LI->getPointerOperand();
14635 if (E->State == TreeEntry::Vectorize) {
14636 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
14637 } else if (E->State == TreeEntry::StridedVectorize) {
14638 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
14639 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
14640 PO = IsReverseOrder ? PtrN : Ptr0;
14641 std::optional<int> Diff = getPointersDiff(
14642 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
14643 Type *StrideTy = DL->getIndexType(PO->getType());
14644 Value *StrideVal;
14645 if (Diff) {
14646 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
14647 StrideVal =
14648 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
14649 DL->getTypeAllocSize(ScalarTy));
14650 } else {
14651 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
14652 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
14653 return cast<LoadInst>(V)->getPointerOperand();
14654 });
14655 OrdersType Order;
14656 std::optional<Value *> Stride =
14657 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
14658 &*Builder.GetInsertPoint());
14659 Value *NewStride =
14660 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
14661 StrideVal = Builder.CreateMul(
14662 NewStride,
14663 ConstantInt::get(
14664 StrideTy,
14665 (IsReverseOrder ? -1 : 1) *
14666 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
14667 }
14668 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
14669 auto *Inst = Builder.CreateIntrinsic(
14670 Intrinsic::experimental_vp_strided_load,
14671 {VecTy, PO->getType(), StrideTy},
14672 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
14673 Builder.getInt32(E->Scalars.size())});
14674 Inst->addParamAttr(
14675 /*ArgNo=*/0,
14676 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
14677 NewLI = Inst;
14678 } else {
14679 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
14680 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
14681 if (E->VectorizedValue) {
14682 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14683 return E->VectorizedValue;
14684 }
14685 if (isa<FixedVectorType>(ScalarTy)) {
14686 assert(SLPReVec && "FixedVectorType is not expected.");
14687 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
14688 // need to expand VecPtr if ScalarTy is a vector type.
14689 unsigned ScalarTyNumElements =
14690 cast<FixedVectorType>(ScalarTy)->getNumElements();
14691 unsigned VecTyNumElements =
14692 cast<FixedVectorType>(VecTy)->getNumElements();
14693 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
14694 "Cannot expand getelementptr.");
14695 unsigned VF = VecTyNumElements / ScalarTyNumElements;
14696 SmallVector<Constant *> Indices(VecTyNumElements);
14697 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
14698 return Builder.getInt64(I % ScalarTyNumElements);
14699 });
14700 VecPtr = Builder.CreateGEP(
14701 VecTy->getElementType(),
14702 Builder.CreateShuffleVector(
14703 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
14704 ConstantVector::get(Indices));
14705 }
14706 // Use the minimum alignment of the gathered loads.
14707 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
14708 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
14709 }
14710 Value *V = propagateMetadata(NewLI, E->Scalars);
14711
14712 V = FinalShuffle(V, E);
14713 E->VectorizedValue = V;
14714 ++NumVectorInstructions;
14715 return V;
14716 }
14717 case Instruction::Store: {
14718 auto *SI = cast<StoreInst>(VL0);
14719
14720 setInsertPointAfterBundle(E);
14721
14722 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
14723 if (VecValue->getType() != VecTy)
14724 VecValue =
14725 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
14726 VecValue = FinalShuffle(VecValue, E);
14727
14728 Value *Ptr = SI->getPointerOperand();
14729 Instruction *ST;
14730 if (E->State == TreeEntry::Vectorize) {
14731 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
14732 } else {
14733 assert(E->State == TreeEntry::StridedVectorize &&
14734 "Expected either strided or consecutive stores.");
14735 if (!E->ReorderIndices.empty()) {
14736 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
14737 Ptr = SI->getPointerOperand();
14738 }
14739 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
14740 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
14741 auto *Inst = Builder.CreateIntrinsic(
14742 Intrinsic::experimental_vp_strided_store,
14743 {VecTy, Ptr->getType(), StrideTy},
14744 {VecValue, Ptr,
14745 ConstantInt::get(
14746 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
14747 Builder.getAllOnesMask(VecTy->getElementCount()),
14748 Builder.getInt32(E->Scalars.size())});
14749 Inst->addParamAttr(
14750 /*ArgNo=*/1,
14751 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
14752 ST = Inst;
14753 }
14754
14755 Value *V = propagateMetadata(ST, E->Scalars);
14756
14757 E->VectorizedValue = V;
14758 ++NumVectorInstructions;
14759 return V;
14760 }
14761 case Instruction::GetElementPtr: {
14762 auto *GEP0 = cast<GetElementPtrInst>(VL0);
14763 setInsertPointAfterBundle(E);
14764
14765 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
14766 if (E->VectorizedValue) {
14767 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14768 return E->VectorizedValue;
14769 }
14770
14771 SmallVector<Value *> OpVecs;
14772 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
14773 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
14774 if (E->VectorizedValue) {
14775 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14776 return E->VectorizedValue;
14777 }
14778 OpVecs.push_back(OpVec);
14779 }
14780
14781 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
14784 for (Value *V : E->Scalars) {
14786 GEPs.push_back(V);
14787 }
14788 V = propagateMetadata(I, GEPs);
14789 }
14790
14791 V = FinalShuffle(V, E);
14792
14793 E->VectorizedValue = V;
14794 ++NumVectorInstructions;
14795
14796 return V;
14797 }
14798 case Instruction::Call: {
14799 CallInst *CI = cast<CallInst>(VL0);
14800 setInsertPointAfterBundle(E);
14801
14803
14804 SmallVector<Type *> ArgTys =
14805 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
14806 It != MinBWs.end() ? It->second.first : 0);
14807 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
14808 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
14809 VecCallCosts.first <= VecCallCosts.second;
14810
14811 Value *ScalarArg = nullptr;
14812 SmallVector<Value *> OpVecs;
14813 SmallVector<Type *, 2> TysForDecl;
14814 // Add return type if intrinsic is overloaded on it.
14815 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
14816 TysForDecl.push_back(VecTy);
14817 auto *CEI = cast<CallInst>(VL0);
14818 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
14819 ValueList OpVL;
14820 // Some intrinsics have scalar arguments. This argument should not be
14821 // vectorized.
14822 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
14823 ScalarArg = CEI->getArgOperand(I);
14824 // If we decided to reduce the bitwidth of the abs intrinsic, its second
14825 // argument must be set to false (do not return poison if the value is the signed minimum).
14826 if (ID == Intrinsic::abs && It != MinBWs.end() &&
14827 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
14828 ScalarArg = Builder.getFalse();
14829 OpVecs.push_back(ScalarArg);
14831 TysForDecl.push_back(ScalarArg->getType());
14832 continue;
14833 }
14834
14835 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
14836 if (E->VectorizedValue) {
14837 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14838 return E->VectorizedValue;
14839 }
14840 ScalarArg = CEI->getArgOperand(I);
14841 if (cast<VectorType>(OpVec->getType())->getElementType() !=
14842 ScalarArg->getType()->getScalarType() &&
14843 It == MinBWs.end()) {
14844 auto *CastTy =
14845 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
14846 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
14847 } else if (It != MinBWs.end()) {
14848 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
14849 }
14850 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
14851 OpVecs.push_back(OpVec);
14852 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
14853 TysForDecl.push_back(OpVec->getType());
14854 }
14855
14856 Function *CF;
14857 if (!UseIntrinsic) {
14858 VFShape Shape =
14861 static_cast<unsigned>(VecTy->getNumElements())),
14862 false /*HasGlobalPred*/);
14863 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
14864 } else {
14865 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
14866 }
14867
14869 CI->getOperandBundlesAsDefs(OpBundles);
14870 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
14871
14872 propagateIRFlags(V, E->Scalars, VL0);
14873 V = FinalShuffle(V, E);
14874
14875 E->VectorizedValue = V;
14876 ++NumVectorInstructions;
14877 return V;
14878 }
14879 case Instruction::ShuffleVector: {
14880 Value *V;
14881 if (SLPReVec && !E->isAltShuffle()) {
14882 assert(E->ReuseShuffleIndices.empty() &&
14883 "Not support ReuseShuffleIndices yet.");
14884 assert(E->ReorderIndices.empty() && "Not support ReorderIndices yet.");
14885 setInsertPointAfterBundle(E);
14886 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
14887 if (E->VectorizedValue) {
14888 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14889 return E->VectorizedValue;
14890 }
14892 "Not supported shufflevector usage.");
14893 auto *SVSrc = cast<ShuffleVectorInst>(Src);
14894 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
14895 "Not supported shufflevector usage.");
14896 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
14897 SmallVector<int> NewMask(ThisMask.size());
14898 transform(ThisMask, NewMask.begin(),
14899 [&SVSrc](int Mask) { return SVSrc->getShuffleMask()[Mask]; });
14900 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
14901 propagateIRFlags(V, E->Scalars, VL0);
14902 } else {
14903 assert(E->isAltShuffle() &&
14904 ((Instruction::isBinaryOp(E->getOpcode()) &&
14905 Instruction::isBinaryOp(E->getAltOpcode())) ||
14906 (Instruction::isCast(E->getOpcode()) &&
14907 Instruction::isCast(E->getAltOpcode())) ||
14908 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
14909 "Invalid Shuffle Vector Operand");
14910
14911 Value *LHS = nullptr, *RHS = nullptr;
14912 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
14913 setInsertPointAfterBundle(E);
14914 LHS = vectorizeOperand(E, 0, PostponedPHIs);
14915 if (E->VectorizedValue) {
14916 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14917 return E->VectorizedValue;
14918 }
14919 RHS = vectorizeOperand(E, 1, PostponedPHIs);
14920 } else {
14921 setInsertPointAfterBundle(E);
14922 LHS = vectorizeOperand(E, 0, PostponedPHIs);
14923 }
14924 if (E->VectorizedValue) {
14925 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
14926 return E->VectorizedValue;
14927 }
14928 if (LHS && RHS &&
14929 ((Instruction::isBinaryOp(E->getOpcode()) &&
14930 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
14931 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
14932 assert((It != MinBWs.end() ||
14933 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
14934 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
14935 MinBWs.contains(getOperandEntry(E, 0)) ||
14936 MinBWs.contains(getOperandEntry(E, 1))) &&
14937 "Expected item in MinBWs.");
14938 Type *CastTy = VecTy;
14939 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
14940 if (cast<VectorType>(LHS->getType())
14941 ->getElementType()
14942 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
14943 ->getElementType()
14944 ->getIntegerBitWidth())
14945 CastTy = RHS->getType();
14946 else
14947 CastTy = LHS->getType();
14948 }
14949 if (LHS->getType() != CastTy)
14950 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
14951 if (RHS->getType() != CastTy)
14952 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
14953 }
14954
14955 Value *V0, *V1;
14956 if (Instruction::isBinaryOp(E->getOpcode())) {
14957 V0 = Builder.CreateBinOp(
14958 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
14959 V1 = Builder.CreateBinOp(
14960 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
14961 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
14962 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
14963 auto *AltCI = cast<CmpInst>(E->getAltOp());
14964 CmpInst::Predicate AltPred = AltCI->getPredicate();
14965 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
14966 } else {
14967 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
14968 unsigned SrcBWSz = DL->getTypeSizeInBits(
14969 cast<VectorType>(LHS->getType())->getElementType());
14970 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
14971 if (BWSz <= SrcBWSz) {
14972 if (BWSz < SrcBWSz)
14973 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
14974 assert(LHS->getType() == VecTy &&
14975 "Expected same type as operand.");
14976 if (auto *I = dyn_cast<Instruction>(LHS))
14977 LHS = propagateMetadata(I, E->Scalars);
14978 LHS = FinalShuffle(LHS, E);
14979 E->VectorizedValue = LHS;
14980 ++NumVectorInstructions;
14981 return LHS;
14982 }
14983 }
14984 V0 = Builder.CreateCast(
14985 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
14986 V1 = Builder.CreateCast(
14987 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
14988 }
14989 // Add V0 and V1 to later analysis to try to find and remove matching
14990 // instruction, if any.
14991 for (Value *V : {V0, V1}) {
14992 if (auto *I = dyn_cast<Instruction>(V)) {
14993 GatherShuffleExtractSeq.insert(I);
14994 CSEBlocks.insert(I->getParent());
14995 }
14996 }
14997
14998 // Create shuffle to take alternate operations from the vector.
14999 // Also, gather up main and alt scalar ops to propagate IR flags to
15000 // each vector operation.
15001 ValueList OpScalars, AltScalars;
15003 E->buildAltOpShuffleMask(
15004 [E, this](Instruction *I) {
15005 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
15006 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
15007 *TLI);
15008 },
15009 Mask, &OpScalars, &AltScalars);
15010
15011 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
15012 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
15013 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
15014 // Drop nuw flags for abs(sub(commutative), true).
15015 if (auto *I = dyn_cast<Instruction>(Vec);
15016 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
15017 any_of(E->Scalars, [](Value *V) {
15018 auto *IV = cast<Instruction>(V);
15019 return IV->getOpcode() == Instruction::Sub &&
15020 isCommutative(cast<Instruction>(IV));
15021 }))
15022 I->setHasNoUnsignedWrap(/*b=*/false);
15023 };
15024 DropNuwFlag(V0, E->getOpcode());
15025 DropNuwFlag(V1, E->getAltOpcode());
15026
15027 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
15028 assert(SLPReVec && "FixedVectorType is not expected.");
15030 }
15031 V = Builder.CreateShuffleVector(V0, V1, Mask);
15032 }
15033 if (auto *I = dyn_cast<Instruction>(V)) {
15034 V = propagateMetadata(I, E->Scalars);
15035 GatherShuffleExtractSeq.insert(I);
15036 CSEBlocks.insert(I->getParent());
15037 }
15038
15039 E->VectorizedValue = V;
15040 ++NumVectorInstructions;
15041
15042 return V;
15043 }
15044 default:
15045 llvm_unreachable("unknown inst");
15046 }
15047 return nullptr;
15048}
15049
15051 ExtraValueToDebugLocsMap ExternallyUsedValues;
15052 return vectorizeTree(ExternallyUsedValues);
15053}
15054
15055Value *
15057 Instruction *ReductionRoot) {
15058 // All blocks must be scheduled before any instructions are inserted.
15059 for (auto &BSIter : BlocksSchedules) {
15060 scheduleBlock(BSIter.second.get());
15061 }
15062 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
15063 // need to rebuild it.
15064 EntryToLastInstruction.clear();
15065
15066 if (ReductionRoot)
15067 Builder.SetInsertPoint(ReductionRoot->getParent(),
15068 ReductionRoot->getIterator());
15069 else
15070 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
15071
15072 // Emit gathered loads first to emit better code for the users of those
15073 // gathered loads.
15074 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15075 if (GatheredLoadsEntriesFirst != NoGatheredLoads &&
15076 TE->Idx >= GatheredLoadsEntriesFirst &&
15077 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
15078 assert((!TE->UserTreeIndices.empty() ||
15079 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
15080 "Expected gathered load node.");
15081 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
15082 }
15083 }
15084 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
15085 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
15086 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
15087 if (TE->State == TreeEntry::Vectorize &&
15088 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
15089 TE->VectorizedValue)
15090 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
15091 // Run through the list of postponed gathers and emit them, replacing the temp
15092 // emitted allocas with actual vector instructions.
15093 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
15095 for (const TreeEntry *E : PostponedNodes) {
15096 auto *TE = const_cast<TreeEntry *>(E);
15097 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
15098 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
15099 TE->UserTreeIndices.front().EdgeIdx)) &&
15100 VecTE->isSame(TE->Scalars))
15101 // Found gather node which is absolutely the same as one of the
15102 // vectorized nodes. It may happen after reordering.
15103 continue;
15104 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
15105 TE->VectorizedValue = nullptr;
15106 auto *UserI =
15107 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
15108 // If user is a PHI node, its vector code have to be inserted right before
15109 // block terminator. Since the node was delayed, there were some unresolved
15110 // dependencies at the moment when stab instruction was emitted. In a case
15111 // when any of these dependencies turn out an operand of another PHI, coming
15112 // from this same block, position of a stab instruction will become invalid.
15113 // This is because the source vector that was supposed to feed this gather node was
15114 // inserted at the end of the block [after stab instruction]. So we need
15115 // to adjust insertion point again to the end of block.
15116 if (isa<PHINode>(UserI)) {
15117 // Insert before all users.
15118 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
15119 for (User *U : PrevVec->users()) {
15120 if (U == UserI)
15121 continue;
15122 auto *UI = dyn_cast<Instruction>(U);
15123 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
15124 continue;
15125 if (UI->comesBefore(InsertPt))
15126 InsertPt = UI;
15127 }
15128 Builder.SetInsertPoint(InsertPt);
15129 } else {
15130 Builder.SetInsertPoint(PrevVec);
15131 }
15132 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
15133 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
15134 if (Vec->getType() != PrevVec->getType()) {
15135 assert(Vec->getType()->isIntOrIntVectorTy() &&
15136 PrevVec->getType()->isIntOrIntVectorTy() &&
15137 "Expected integer vector types only.");
15138 std::optional<bool> IsSigned;
15139 for (Value *V : TE->Scalars) {
15140 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
15141 auto It = MinBWs.find(BaseTE);
15142 if (It != MinBWs.end()) {
15143 IsSigned = IsSigned.value_or(false) || It->second.second;
15144 if (*IsSigned)
15145 break;
15146 }
15147 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
15148 auto It = MinBWs.find(MNTE);
15149 if (It != MinBWs.end()) {
15150 IsSigned = IsSigned.value_or(false) || It->second.second;
15151 if (*IsSigned)
15152 break;
15153 }
15154 }
15155 if (IsSigned.value_or(false))
15156 break;
15157 // Scan through gather nodes.
15158 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
15159 auto It = MinBWs.find(BVE);
15160 if (It != MinBWs.end()) {
15161 IsSigned = IsSigned.value_or(false) || It->second.second;
15162 if (*IsSigned)
15163 break;
15164 }
15165 }
15166 if (IsSigned.value_or(false))
15167 break;
15168 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
15169 IsSigned =
15170 IsSigned.value_or(false) ||
15171 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
15172 continue;
15173 }
15174 if (IsSigned.value_or(false))
15175 break;
15176 }
15177 }
15178 if (IsSigned.value_or(false)) {
15179 // Final attempt - check user node.
15180 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
15181 if (It != MinBWs.end())
15182 IsSigned = It->second.second;
15183 }
15184 assert(IsSigned &&
15185 "Expected user node or perfect diamond match in MinBWs.");
15186 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
15187 }
15188 PrevVec->replaceAllUsesWith(Vec);
15189 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
15190 // Replace the stub vector node, if it was used before for one of the
15191 // buildvector nodes already.
15192 auto It = PostponedValues.find(PrevVec);
15193 if (It != PostponedValues.end()) {
15194 for (TreeEntry *VTE : It->getSecond())
15195 VTE->VectorizedValue = Vec;
15196 }
15197 eraseInstruction(PrevVec);
15198 }
15199
15200 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
15201 << " values .\n");
15202
15204 // Maps vector instruction to original insertelement instruction
15205 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
15206 // Maps extract Scalar to the corresponding extractelement instruction in the
15207 // basic block. Only one extractelement per block should be emitted.
15209 ScalarToEEs;
15210 SmallDenseSet<Value *, 4> UsedInserts;
15212 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
15214 // Extract all of the elements with the external uses.
15215 for (const auto &ExternalUse : ExternalUses) {
15216 Value *Scalar = ExternalUse.Scalar;
15217 llvm::User *User = ExternalUse.User;
15218
15219 // Skip users that we already RAUW. This happens when one instruction
15220 // has multiple uses of the same value.
15221 if (User && !is_contained(Scalar->users(), User))
15222 continue;
15223 TreeEntry *E = getTreeEntry(Scalar);
15224 assert(E && "Invalid scalar");
15225 assert(!E->isGather() && "Extracting from a gather list");
15226 // Non-instruction pointers are not deleted, just skip them.
15227 if (E->getOpcode() == Instruction::GetElementPtr &&
15228 !isa<GetElementPtrInst>(Scalar))
15229 continue;
15230
15231 Value *Vec = E->VectorizedValue;
15232 assert(Vec && "Can't find vectorizable value");
15233
15234 Value *Lane = Builder.getInt32(ExternalUse.Lane);
15235 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
15236 if (Scalar->getType() != Vec->getType()) {
15237 Value *Ex = nullptr;
15238 Value *ExV = nullptr;
15239 auto *Inst = dyn_cast<Instruction>(Scalar);
15240 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
15241 auto It = ScalarToEEs.find(Scalar);
15242 if (It != ScalarToEEs.end()) {
15243 // No need to emit many extracts, just move the only one in the
15244 // current block.
15245 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
15246 : Builder.GetInsertBlock());
15247 if (EEIt != It->second.end()) {
15248 Value *PrevV = EEIt->second.first;
15249 if (auto *I = dyn_cast<Instruction>(PrevV);
15250 I && !ReplaceInst &&
15251 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
15252 Builder.GetInsertPoint()->comesBefore(I)) {
15253 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
15254 Builder.GetInsertPoint());
15255 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
15256 CI->moveAfter(I);
15257 }
15258 Ex = PrevV;
15259 ExV = EEIt->second.second ? EEIt->second.second : Ex;
15260 }
15261 }
15262 if (!Ex) {
15263 // "Reuse" the existing extract to improve final codegen.
15264 if (ReplaceInst) {
15265 // Leave the instruction as is, if it cheaper extracts and all
15266 // operands are scalar.
15267 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
15268 IgnoredExtracts.insert(EE);
15269 Ex = EE;
15270 } else {
15271 auto *CloneInst = Inst->clone();
15272 CloneInst->insertBefore(Inst);
15273 if (Inst->hasName())
15274 CloneInst->takeName(Inst);
15275 Ex = CloneInst;
15276 }
15277 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
15278 ES && isa<Instruction>(Vec)) {
15279 Value *V = ES->getVectorOperand();
15280 auto *IVec = cast<Instruction>(Vec);
15281 if (const TreeEntry *ETE = getTreeEntry(V))
15282 V = ETE->VectorizedValue;
15283 if (auto *IV = dyn_cast<Instruction>(V);
15284 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
15285 IV->comesBefore(IVec))
15286 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
15287 else
15288 Ex = Builder.CreateExtractElement(Vec, Lane);
15289 } else if (auto *VecTy =
15290 dyn_cast<FixedVectorType>(Scalar->getType())) {
15291 assert(SLPReVec && "FixedVectorType is not expected.");
15292 unsigned VecTyNumElements = VecTy->getNumElements();
15293 // When REVEC is enabled, we need to extract a vector.
15294 // Note: The element size of Scalar may be different from the
15295 // element size of Vec.
15296 Ex = Builder.CreateExtractVector(
15298 VecTyNumElements),
15299 Vec, Builder.getInt64(ExternalUse.Lane * VecTyNumElements));
15300 } else {
15301 Ex = Builder.CreateExtractElement(Vec, Lane);
15302 }
15303 // If necessary, sign-extend or zero-extend ScalarRoot
15304 // to the larger type.
15305 ExV = Ex;
15306 if (Scalar->getType() != Ex->getType())
15307 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
15308 MinBWs.find(E)->second.second);
15309 auto *I = dyn_cast<Instruction>(Ex);
15310 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
15311 : &F->getEntryBlock(),
15312 std::make_pair(Ex, ExV));
15313 }
15314 // The then branch of the previous if may produce constants, since 0
15315 // operand might be a constant.
15316 if (auto *ExI = dyn_cast<Instruction>(Ex);
15317 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
15318 GatherShuffleExtractSeq.insert(ExI);
15319 CSEBlocks.insert(ExI->getParent());
15320 }
15321 return ExV;
15322 }
15323 assert(isa<FixedVectorType>(Scalar->getType()) &&
15324 isa<InsertElementInst>(Scalar) &&
15325 "In-tree scalar of vector type is not insertelement?");
15326 auto *IE = cast<InsertElementInst>(Scalar);
15327 VectorToInsertElement.try_emplace(Vec, IE);
15328 return Vec;
15329 };
15330 // If User == nullptr, the Scalar remains as scalar in vectorized
15331 // instructions or is used as extra arg. Generate ExtractElement instruction
15332 // and update the record for this scalar in ExternallyUsedValues.
15333 if (!User) {
15334 if (!ScalarsWithNullptrUser.insert(Scalar).second)
15335 continue;
15336 assert((ExternallyUsedValues.count(Scalar) ||
15337 Scalar->hasNUsesOrMore(UsesLimit) ||
15338 ExternalUsesAsOriginalScalar.contains(Scalar) ||
15339 any_of(Scalar->users(),
15340 [&](llvm::User *U) {
15341 if (ExternalUsesAsOriginalScalar.contains(U))
15342 return true;
15343 TreeEntry *UseEntry = getTreeEntry(U);
15344 return UseEntry &&
15345 (UseEntry->State == TreeEntry::Vectorize ||
15346 UseEntry->State ==
15347 TreeEntry::StridedVectorize) &&
15348 (E->State == TreeEntry::Vectorize ||
15349 E->State == TreeEntry::StridedVectorize) &&
15350 doesInTreeUserNeedToExtract(
15351 Scalar, getRootEntryInstruction(*UseEntry),
15352 TLI);
15353 })) &&
15354 "Scalar with nullptr User must be registered in "
15355 "ExternallyUsedValues map or remain as scalar in vectorized "
15356 "instructions");
15357 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
15358 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
15359 if (PHI->getParent()->isLandingPad())
15360 Builder.SetInsertPoint(
15361 PHI->getParent(),
15362 std::next(
15363 PHI->getParent()->getLandingPadInst()->getIterator()));
15364 else
15365 Builder.SetInsertPoint(PHI->getParent(),
15366 PHI->getParent()->getFirstNonPHIIt());
15367 } else {
15368 Builder.SetInsertPoint(VecI->getParent(),
15369 std::next(VecI->getIterator()));
15370 }
15371 } else {
15372 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
15373 }
15374 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15375 // Required to update internally referenced instructions.
15376 if (Scalar != NewInst) {
15377 assert((!isa<ExtractElementInst>(Scalar) ||
15378 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
15379 "Extractelements should not be replaced.");
15380 Scalar->replaceAllUsesWith(NewInst);
15381 }
15382 continue;
15383 }
15384
15385 if (auto *VU = dyn_cast<InsertElementInst>(User);
15386 VU && VU->getOperand(1) == Scalar) {
15387 // Skip if the scalar is another vector op or Vec is not an instruction.
15388 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
15389 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
15390 if (!UsedInserts.insert(VU).second)
15391 continue;
15392 // Need to use original vector, if the root is truncated.
15393 auto BWIt = MinBWs.find(E);
15394 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
15395 auto *ScalarTy = FTy->getElementType();
15396 auto Key = std::make_pair(Vec, ScalarTy);
15397 auto VecIt = VectorCasts.find(Key);
15398 if (VecIt == VectorCasts.end()) {
15399 IRBuilderBase::InsertPointGuard Guard(Builder);
15400 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
15401 if (IVec->getParent()->isLandingPad())
15402 Builder.SetInsertPoint(IVec->getParent(),
15403 std::next(IVec->getParent()
15404 ->getLandingPadInst()
15405 ->getIterator()));
15406 else
15407 Builder.SetInsertPoint(
15408 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
15409 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
15410 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
15411 }
15412 Vec = Builder.CreateIntCast(
15413 Vec,
15415 ScalarTy,
15416 cast<FixedVectorType>(Vec->getType())->getNumElements()),
15417 BWIt->second.second);
15418 VectorCasts.try_emplace(Key, Vec);
15419 } else {
15420 Vec = VecIt->second;
15421 }
15422 }
15423
15424 std::optional<unsigned> InsertIdx = getElementIndex(VU);
15425 if (InsertIdx) {
15426 auto *It = find_if(
15427 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
15428 // Checks if 2 insertelements are from the same buildvector.
15429 InsertElementInst *VecInsert = Data.InsertElements.front();
15431 VU, VecInsert,
15432 [](InsertElementInst *II) { return II->getOperand(0); });
15433 });
15434 unsigned Idx = *InsertIdx;
15435 if (It == ShuffledInserts.end()) {
15436 (void)ShuffledInserts.emplace_back();
15437 It = std::next(ShuffledInserts.begin(),
15438 ShuffledInserts.size() - 1);
15439 }
15440 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
15441 if (Mask.empty())
15442 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
15443 Mask[Idx] = ExternalUse.Lane;
15444 It->InsertElements.push_back(cast<InsertElementInst>(User));
15445 continue;
15446 }
15447 }
15448 }
15449 }
15450
15451 // Generate extracts for out-of-tree users.
15452 // Find the insertion point for the extractelement lane.
15453 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
15454 if (PHINode *PH = dyn_cast<PHINode>(User)) {
15455 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15456 if (PH->getIncomingValue(I) == Scalar) {
15457 Instruction *IncomingTerminator =
15458 PH->getIncomingBlock(I)->getTerminator();
15459 if (isa<CatchSwitchInst>(IncomingTerminator)) {
15460 Builder.SetInsertPoint(VecI->getParent(),
15461 std::next(VecI->getIterator()));
15462 } else {
15463 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
15464 }
15465 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15466 PH->setOperand(I, NewInst);
15467 }
15468 }
15469 } else {
15470 Builder.SetInsertPoint(cast<Instruction>(User));
15471 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15472 User->replaceUsesOfWith(Scalar, NewInst);
15473 }
15474 } else {
15475 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
15476 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
15477 User->replaceUsesOfWith(Scalar, NewInst);
15478 }
15479
15480 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
15481 }
15482
15483 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
15484 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
15485 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
15486 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
15487 for (int I = 0, E = Mask.size(); I < E; ++I) {
15488 if (Mask[I] < VF)
15489 CombinedMask1[I] = Mask[I];
15490 else
15491 CombinedMask2[I] = Mask[I] - VF;
15492 }
15493 ShuffleInstructionBuilder ShuffleBuilder(
15494 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
15495 ShuffleBuilder.add(V1, CombinedMask1);
15496 if (V2)
15497 ShuffleBuilder.add(V2, CombinedMask2);
15498 return ShuffleBuilder.finalize({}, {});
15499 };
15500
15501 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
15502 bool ForSingleMask) {
15503 unsigned VF = Mask.size();
15504 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
15505 if (VF != VecVF) {
15506 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
15507 Vec = CreateShuffle(Vec, nullptr, Mask);
15508 return std::make_pair(Vec, true);
15509 }
15510 if (!ForSingleMask) {
15511 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
15512 for (unsigned I = 0; I < VF; ++I) {
15513 if (Mask[I] != PoisonMaskElem)
15514 ResizeMask[Mask[I]] = Mask[I];
15515 }
15516 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
15517 }
15518 }
15519
15520 return std::make_pair(Vec, false);
15521 };
15522 // Perform shuffling of the vectorize tree entries for better handling of
15523 // external extracts.
15524 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
15525 // Find the first and the last instruction in the list of insertelements.
15526 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
15527 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
15528 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
15529 Builder.SetInsertPoint(LastInsert);
15530 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
15532 MutableArrayRef(Vector.data(), Vector.size()),
15533 FirstInsert->getOperand(0),
15534 [](Value *Vec) {
15535 return cast<VectorType>(Vec->getType())
15536 ->getElementCount()
15537 .getKnownMinValue();
15538 },
15539 ResizeToVF,
15540 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
15541 ArrayRef<Value *> Vals) {
15542 assert((Vals.size() == 1 || Vals.size() == 2) &&
15543 "Expected exactly 1 or 2 input values.");
15544 if (Vals.size() == 1) {
15545 // Do not create shuffle if the mask is a simple identity
15546 // non-resizing mask.
15547 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
15548 ->getNumElements() ||
15549 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
15550 return CreateShuffle(Vals.front(), nullptr, Mask);
15551 return Vals.front();
15552 }
15553 return CreateShuffle(Vals.front() ? Vals.front()
15554 : FirstInsert->getOperand(0),
15555 Vals.back(), Mask);
15556 });
15557 auto It = ShuffledInserts[I].InsertElements.rbegin();
15558 // Rebuild buildvector chain.
15559 InsertElementInst *II = nullptr;
15560 if (It != ShuffledInserts[I].InsertElements.rend())
15561 II = *It;
15563 while (It != ShuffledInserts[I].InsertElements.rend()) {
15564 assert(II && "Must be an insertelement instruction.");
15565 if (*It == II)
15566 ++It;
15567 else
15568 Inserts.push_back(cast<Instruction>(II));
15569 II = dyn_cast<InsertElementInst>(II->getOperand(0));
15570 }
15571 for (Instruction *II : reverse(Inserts)) {
15572 II->replaceUsesOfWith(II->getOperand(0), NewInst);
15573 if (auto *NewI = dyn_cast<Instruction>(NewInst))
15574 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
15575 II->moveAfter(NewI);
15576 NewInst = II;
15577 }
15578 LastInsert->replaceAllUsesWith(NewInst);
15579 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
15580 IE->replaceUsesOfWith(IE->getOperand(0),
15581 PoisonValue::get(IE->getOperand(0)->getType()));
15582 IE->replaceUsesOfWith(IE->getOperand(1),
15583 PoisonValue::get(IE->getOperand(1)->getType()));
15584 eraseInstruction(IE);
15585 }
15586 CSEBlocks.insert(LastInsert->getParent());
15587 }
15588
15589 SmallVector<Instruction *> RemovedInsts;
15590 // For each vectorized value:
15591 for (auto &TEPtr : VectorizableTree) {
15592 TreeEntry *Entry = TEPtr.get();
15593
15594 // No need to handle users of gathered values.
15595 if (Entry->isGather())
15596 continue;
15597
15598 assert(Entry->VectorizedValue && "Can't find vectorizable value");
15599
15600 // For each lane:
15601 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
15602 Value *Scalar = Entry->Scalars[Lane];
15603
15604 if (Entry->getOpcode() == Instruction::GetElementPtr &&
15605 !isa<GetElementPtrInst>(Scalar))
15606 continue;
15607 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
15608 EE && IgnoredExtracts.contains(EE))
15609 continue;
15610#ifndef NDEBUG
15611 Type *Ty = Scalar->getType();
15612 if (!Ty->isVoidTy()) {
15613 for (User *U : Scalar->users()) {
15614 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
15615
15616 // It is legal to delete users in the ignorelist.
15617 assert((getTreeEntry(U) ||
15618 (UserIgnoreList && UserIgnoreList->contains(U)) ||
15621 "Deleting out-of-tree value");
15622 }
15623 }
15624#endif
15625 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
15626 auto *I = cast<Instruction>(Scalar);
15627 RemovedInsts.push_back(I);
15628 }
15629 }
15630
15631 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
15632 // new vector instruction.
15633 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
15634 V->mergeDIAssignID(RemovedInsts);
15635
15636 // Clear up reduction references, if any.
15637 if (UserIgnoreList) {
15638 for (Instruction *I : RemovedInsts) {
15639 const TreeEntry *IE = getTreeEntry(I);
15640 if (IE->Idx != 0 &&
15641 !(VectorizableTree.front()->isGather() && isa<LoadInst>(I) &&
15642 !IE->UserTreeIndices.empty() &&
15643 any_of(IE->UserTreeIndices,
15644 [&](const EdgeInfo &EI) {
15645 return EI.UserTE == VectorizableTree.front().get() &&
15646 EI.EdgeIdx == UINT_MAX;
15647 })) &&
15648 !(GatheredLoadsEntriesFirst != NoGatheredLoads &&
15649 IE->Idx >= GatheredLoadsEntriesFirst &&
15650 VectorizableTree.front()->isGather() &&
15651 is_contained(VectorizableTree.front()->Scalars, I)))
15652 continue;
15653 SmallVector<SelectInst *> LogicalOpSelects;
15654 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
15655 // Do not replace condition of the logical op in form select <cond>.
15656 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
15657 (match(U.getUser(), m_LogicalAnd()) ||
15658 match(U.getUser(), m_LogicalOr())) &&
15659 U.getOperandNo() == 0;
15660 if (IsPoisoningLogicalOp) {
15661 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
15662 return false;
15663 }
15664 return UserIgnoreList->contains(U.getUser());
15665 });
15666 // Replace conditions of the poisoning logical ops with the non-poison
15667 // constant value.
15668 for (SelectInst *SI : LogicalOpSelects)
15669 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
15670 }
15671 }
15672 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
15673 // cache correctness.
15674 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
15675 // - instructions are not deleted until later.
15677
15678 Builder.ClearInsertionPoint();
15679 InstrElementSize.clear();
15680
15681 const TreeEntry &RootTE = *VectorizableTree.front();
15682 Value *Vec = RootTE.VectorizedValue;
15683 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
15684 It != MinBWs.end() &&
15685 ReductionBitWidth != It->second.first) {
15686 IRBuilder<>::InsertPointGuard Guard(Builder);
15687 Builder.SetInsertPoint(ReductionRoot->getParent(),
15688 ReductionRoot->getIterator());
15689 Vec = Builder.CreateIntCast(
15690 Vec,
15691 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
15692 cast<VectorType>(Vec->getType())->getElementCount()),
15693 It->second.second);
15694 }
15695 return Vec;
15696}
15697
// NOTE(review): the enclosing function's header line (original line 15698) is
// missing from this extract; the body below appears to be
// BoUpSLP::optimizeGatherSequence — confirm against the full source.
// The function does two cleanups on the emitted gather/shuffle/extract
// sequence: (1) a simple LICM pass hoisting loop-invariant instructions into
// loop preheaders, and (2) a dominance-ordered CSE pass that merges identical
// (or "less defined") shuffles.
15699 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
15700 << " gather sequences instructions.\n");
15701 // LICM InsertElementInst sequences.
15702 for (Instruction *I : GatherShuffleExtractSeq) {
15703 if (isDeleted(I))
15704 continue;
15705
15706 // Check if this block is inside a loop.
15707 Loop *L = LI->getLoopFor(I->getParent());
15708 if (!L)
15709 continue;
15710
15711 // Check if it has a preheader.
15712 BasicBlock *PreHeader = L->getLoopPreheader();
15713 if (!PreHeader)
15714 continue;
15715
15716 // If the vector or the element that we insert into it are
15717 // instructions that are defined in this basic block then we can't
15718 // hoist this instruction.
15719 if (any_of(I->operands(), [L](Value *V) {
15720 auto *OpI = dyn_cast<Instruction>(V);
15721 return OpI && L->contains(OpI);
15722 }))
15723 continue;
15724
15725 // We can hoist this instruction. Move it to the pre-header.
15726 I->moveBefore(PreHeader->getTerminator());
15727 CSEBlocks.insert(PreHeader);
15728 }
15729
15730 // Make a list of all reachable blocks in our CSE queue.
// NOTE(review): the declaration of CSEWorkList (original line 15731) was
// dropped by extraction.
15732 CSEWorkList.reserve(CSEBlocks.size());
15733 for (BasicBlock *BB : CSEBlocks)
15734 if (DomTreeNode *N = DT->getNode(BB)) {
// NOTE(review): original line 15735 (presumably an assert on N) is missing.
15736 CSEWorkList.push_back(N);
15737 }
15738
15739 // Sort blocks by domination. This ensures we visit a block after all blocks
15740 // dominating it are visited.
15741 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
15742 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
15743 "Different nodes should have different DFS numbers");
15744 return A->getDFSNumIn() < B->getDFSNumIn();
15745 });
15746
15747 // Less defined shuffles can be replaced by the more defined copies.
15748 // Between two shuffles one is less defined if it has the same vector operands
15749 // and its mask indeces are the same as in the first one or undefs. E.g.
15750 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
15751 // poison, <0, 0, 0, 0>.
// Returns true when I1 can be replaced by I2; for shuffles it also fills
// NewMask with the merged (more defined) mask to install on the survivor.
15752 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
15753 SmallVectorImpl<int> &NewMask) {
15754 if (I1->getType() != I2->getType())
15755 return false;
15756 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
15757 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
15758 if (!SI1 || !SI2)
15759 return I1->isIdenticalTo(I2);
15760 if (SI1->isIdenticalTo(SI2))
15761 return true;
15762 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
15763 if (SI1->getOperand(I) != SI2->getOperand(I))
15764 return false;
15765 // Check if the second instruction is more defined than the first one.
15766 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
15767 ArrayRef<int> SM1 = SI1->getShuffleMask();
15768 // Count trailing undefs in the mask to check the final number of used
15769 // registers.
15770 unsigned LastUndefsCnt = 0;
15771 for (int I = 0, E = NewMask.size(); I < E; ++I) {
15772 if (SM1[I] == PoisonMaskElem)
15773 ++LastUndefsCnt;
15774 else
15775 LastUndefsCnt = 0;
15776 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
15777 NewMask[I] != SM1[I])
15778 return false;
15779 if (NewMask[I] == PoisonMaskElem)
15780 NewMask[I] = SM1[I];
15781 }
15782 // Check if the last undefs actually change the final number of used vector
15783 // registers.
15784 return SM1.size() - LastUndefsCnt > 1 &&
15785 TTI->getNumberOfParts(SI1->getType()) ==
// NOTE(review): original line 15786 (the second getNumberOfParts call head)
// is missing from this extract; the expression below is its argument.
15787 getWidenedType(SI1->getType()->getElementType(),
15788 SM1.size() - LastUndefsCnt));
15789 };
15790 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
15791 // instructions. TODO: We can further optimize this scan if we split the
15792 // instructions into different buckets based on the insert lane.
// NOTE(review): the declaration of Visited (original line 15793) was dropped
// by extraction.
15794 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
15795 assert(*I &&
15796 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
15797 "Worklist not sorted properly!");
15798 BasicBlock *BB = (*I)->getBlock();
15799 // For all instructions in blocks containing gather sequences:
15800 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
15801 if (isDeleted(&In))
15802 continue;
// NOTE(review): original line 15803 (the first half of this condition — an
// instruction-kind filter) is missing from this extract.
15804 !GatherShuffleExtractSeq.contains(&In))
15805 continue;
15806
15807 // Check if we can replace this instruction with any of the
15808 // visited instructions.
15809 bool Replaced = false;
15810 for (Instruction *&V : Visited) {
15811 SmallVector<int> NewMask;
// Case 1: `In` is redundant with an already-visited `V` that dominates it —
// drop `In`, keep `V` (possibly with a more defined mask).
15812 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
15813 DT->dominates(V->getParent(), In.getParent())) {
15814 In.replaceAllUsesWith(V);
15815 eraseInstruction(&In);
15816 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
15817 if (!NewMask.empty())
15818 SI->setShuffleMask(NewMask);
15819 Replaced = true;
15820 break;
15821 }
// Case 2 (condition head lost — original line 15822 missing): the visited
// `V` is the redundant one; keep `In` and retire `V`.
15823 GatherShuffleExtractSeq.contains(V) &&
15824 IsIdenticalOrLessDefined(V, &In, NewMask) &&
15825 DT->dominates(In.getParent(), V->getParent())) {
15826 In.moveAfter(V);
15827 V->replaceAllUsesWith(&In);
// NOTE(review): original line 15828 (presumably the erasure of V) is missing
// from this extract.
15829 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
15830 if (!NewMask.empty())
15831 SI->setShuffleMask(NewMask);
15832 V = &In;
15833 Replaced = true;
15834 break;
15835 }
15836 }
15837 if (!Replaced) {
15838 assert(!is_contained(Visited, &In));
15839 Visited.push_back(&In);
15840 }
15841 }
15842 }
15843 CSEBlocks.clear();
15844 GatherShuffleExtractSeq.clear();
15845}
15846
// Links the ScheduleData nodes of all values in VL into a single bundle:
// every member's FirstInBundle points at the head, and NextInBundle forms the
// chain in VL order. Returns the head of the bundle (asserts if none found).
15847BoUpSLP::ScheduleData *
15848BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
15849 ScheduleData *Bundle = nullptr;
15850 ScheduleData *PrevInBundle = nullptr;
15851 for (Value *V : VL) {
// NOTE(review): the guard condition on original line 15852 is missing from
// this extract — presumably a doesNotNeedToBeScheduled(V) check; confirm
// against the full source.
15853 continue;
15854 ScheduleData *BundleMember = getScheduleData(V);
15855 assert(BundleMember &&
15856 "no ScheduleData for bundle member "
15857 "(maybe not in same basic block)");
15858 assert(BundleMember->isSchedulingEntity() &&
15859 "bundle member already part of other bundle");
15860 if (PrevInBundle) {
15861 PrevInBundle->NextInBundle = BundleMember;
15862 } else {
// First schedulable member becomes the bundle head.
15863 Bundle = BundleMember;
15864 }
15865
15866 // Group the instructions to a bundle.
15867 BundleMember->FirstInBundle = Bundle;
15868 PrevInBundle = BundleMember;
15869 }
15870 assert(Bundle && "Failed to find schedule bundle");
15871 return Bundle;
15872}
15873
15874// Groups the instructions to a bundle (which is then a single scheduling entity)
15875// and schedules instructions until the bundle gets ready.
//
// Returns:
//  - nullptr           : VL does not need scheduling at all (PHIs etc.);
//  - std::nullopt      : scheduling failed (region could not be extended, or
//                        the bundle never became ready — it is cancelled);
//  - a ScheduleData*   : the successfully formed, ready bundle.
15876std::optional<BoUpSLP::ScheduleData *>
15877BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
15878 const InstructionsState &S) {
15879 // No need to schedule PHIs, insertelement, extractelement and extractvalue
15880 // instructions.
// NOTE(review): the tail of this condition (original line 15882) is missing
// from this extract.
15881 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
15883 return nullptr;
15884
15885 // Initialize the instruction bundle.
15886 Instruction *OldScheduleEnd = ScheduleEnd;
15887 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
15888
// Shared tail: recompute dependencies if the region grew downwards, compute
// the bundle's dependencies, optionally reset/refill the ready list, then
// pump the ready list until the bundle is ready (or no bundle / list empty).
15889 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
15890 ScheduleData *Bundle) {
15891 // The scheduling region got new instructions at the lower end (or it is a
15892 // new region for the first bundle). This makes it necessary to
15893 // recalculate all dependencies.
15894 // It is seldom that this needs to be done a second time after adding the
15895 // initial bundle to the region.
15896 if (ScheduleEnd != OldScheduleEnd) {
15897 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
15898 if (ScheduleData *SD = getScheduleData(I))
15899 SD->clearDependencies();
15900 ReSchedule = true;
15901 }
15902 if (Bundle) {
15903 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
15904 << " in block " << BB->getName() << "\n");
15905 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
15906 }
15907
15908 if (ReSchedule) {
15909 resetSchedule();
15910 initialFillReadyList(ReadyInsts);
15911 }
15912
15913 // Now try to schedule the new bundle or (if no bundle) just calculate
15914 // dependencies. As soon as the bundle is "ready" it means that there are no
15915 // cyclic dependencies and we can schedule it. Note that's important that we
15916 // don't "schedule" the bundle yet (see cancelScheduling).
15917 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
15918 !ReadyInsts.empty()) {
15919 ScheduleData *Picked = ReadyInsts.pop_back_val();
15920 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
15921 "must be ready to schedule");
15922 schedule(Picked, ReadyInsts);
15923 }
15924 };
15925
15926 // Make sure that the scheduling region contains all
15927 // instructions of the bundle.
15928 for (Value *V : VL) {
// NOTE(review): the guard condition on original line 15929 is missing from
// this extract.
15930 continue;
15931 if (!extendSchedulingRegion(V, S)) {
15932 // If the scheduling region got new instructions at the lower end (or it
15933 // is a new region for the first bundle). This makes it necessary to
15934 // recalculate all dependencies.
15935 // Otherwise the compiler may crash trying to incorrectly calculate
15936 // dependencies and emit instruction in the wrong order at the actual
15937 // scheduling.
15938 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
15939 return std::nullopt;
15940 }
15941 }
15942
15943 bool ReSchedule = false;
15944 for (Value *V : VL) {
// NOTE(review): the guard condition on original line 15945 is missing from
// this extract.
15946 continue;
15947 ScheduleData *BundleMember = getScheduleData(V);
15948 assert(BundleMember &&
15949 "no ScheduleData for bundle member (maybe not in same basic block)");
15950
15951 // Make sure we don't leave the pieces of the bundle in the ready list when
15952 // whole bundle might not be ready.
15953 ReadyInsts.remove(BundleMember);
15954
15955 if (!BundleMember->IsScheduled)
15956 continue;
15957 // A bundle member was scheduled as single instruction before and now
15958 // needs to be scheduled as part of the bundle. We just get rid of the
15959 // existing schedule.
15960 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
15961 << " was already scheduled\n");
15962 ReSchedule = true;
15963 }
15964
15965 auto *Bundle = buildBundle(VL);
15966 TryScheduleBundleImpl(ReSchedule, Bundle);
15967 if (!Bundle->isReady()) {
// Cyclic dependencies (or unschedulable members): undo the bundling.
15968 cancelScheduling(VL, S.OpValue);
15969 return std::nullopt;
15970 }
15971 return Bundle;
15972}
15973
// Undoes buildBundle for VL: dissolves the bundle back into independent
// single-instruction scheduling entities, removing the bundle from the ready
// list and re-inserting any member that has no unscheduled in-bundle deps.
15974void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
15975 Value *OpValue) {
// Mirror of tryScheduleBundle's early-out: these values were never bundled.
// NOTE(review): the tail of this condition (original line 15977) is missing
// from this extract.
15976 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
15978 return;
15979
// OpValue itself may be unschedulable; pick a member that actually carries
// ScheduleData.
15980 if (doesNotNeedToBeScheduled(OpValue))
15981 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
15982 ScheduleData *Bundle = getScheduleData(OpValue);
15983 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
15984 assert(!Bundle->IsScheduled &&
15985 "Can't cancel bundle which is already scheduled");
15986 assert(Bundle->isSchedulingEntity() &&
15987 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
15988 "tried to unbundle something which is not a bundle");
15989
15990 // Remove the bundle from the ready list.
15991 if (Bundle->isReady())
15992 ReadyInsts.remove(Bundle);
15993
15994 // Un-bundle: make single instructions out of the bundle.
15995 ScheduleData *BundleMember = Bundle;
15996 while (BundleMember) {
15997 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
15998 BundleMember->FirstInBundle = BundleMember;
15999 ScheduleData *Next = BundleMember->NextInBundle;
16000 BundleMember->NextInBundle = nullptr;
16001 BundleMember->TE = nullptr;
// A now-standalone member with no pending deps becomes ready immediately.
16002 if (BundleMember->unscheduledDepsInBundle() == 0) {
16003 ReadyInsts.insert(BundleMember);
16004 }
16005 BundleMember = Next;
16006 }
16007}
16008
16009BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
16010 // Allocate a new ScheduleData for the instruction.
16011 if (ChunkPos >= ChunkSize) {
16012 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
16013 ChunkPos = 0;
16014 }
16015 return &(ScheduleDataChunks.back()[ChunkPos++]);
16016}
16017
/// Tries to grow the scheduling region of this block so it covers the
/// instruction behind \p V. Returns true if the instruction already has
/// ScheduleData, or once the region has been successfully extended up or down
/// to reach it; returns false when doing so would exceed
/// ScheduleRegionSizeLimit.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  assert(I && "bundle member must be an instruction");
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  // Fast path: the instruction is already inside the scheduling region.
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  // Predicate for instructions that should not consume region budget.
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  // Step one instruction in each direction per iteration until I (or a block
  // boundary) is found; each step consumes scheduling-region budget.
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    // I lies above the current region: extend the region start upwards.
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  // Otherwise I lies below the current region: extend the region end down.
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
16087
/// Creates (or re-uses) and initializes ScheduleData for every schedulable
/// instruction in the half-open range [FromI, ToI). Memory-accessing
/// instructions of the range are spliced into the NextLoadStore linked list
/// between \p PrevLoadStore and \p NextLoadStore, updating
/// FirstLoadStoreInRegion/LastLoadStoreInRegion as needed.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    // Marker intrinsics (llvm.sideeffect / llvm.pseudoprobe) are excluded
    // from the load/store chain even though they report memory effects.
    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

      RegionHasStackSave = true;
  }
  // Terminate or reconnect the load/store chain at the region boundary.
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
16131
/// (Re-)computes the dependencies of the bundle \p SD: def-use dependencies,
/// control dependencies (early exits, stacksave/stackrestore, allocas) and
/// memory dependencies via the NextLoadStore chain. Works transitively via a
/// worklist over every bundle whose dependencies are not valid. If
/// \p InsertInReadyList is true, \p SD is pushed to ReadyInsts once it
/// becomes ready.
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
                 << "\n");
      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      // Records "I must stay after BundleMember" as a control dependency and
      // re-queues I's bundle if its dependencies need recomputation.
      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependend on any early exit or non-willreturn call
      // which proceeds it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          // Everything past here must be control dependent on I.
          break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloc alloca instruction, it needs to be scheduled
        // after any preceeding stacksave. We also need to prevent any alloca
        // from reordering above a preceeding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependend on BundleMember->Inst.
            break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handle just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is currently
        // thought to be conservatism. Moving loads/stores below a stackrestore
        // can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }

      // Handle the memory dependencies (if any).
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (!DepDest)
        continue;
      Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non memory effecting bundle?");
      MemoryLocation SrcLoc = getLocation(SrcInst);
      bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
      unsigned NumAliased = 0;
      unsigned DistToSrc = 1;

      // Walk the later memory accesses in the region and add a dependency for
      // every potentially-aliasing pair (or once the distance budget is hit).
      for (; DepDest; DepDest = DepDest->NextLoadStore) {
        assert(isInSchedulingRegion(DepDest));

        // We have two limits to reduce the complexity:
        // 1) AliasedCheckLimit: It's a small limit to reduce calls to
        //    SLP->isAliased (which is the expensive part in this loop).
        // 2) MaxMemDepDistance: It's for very large blocks and it aborts
        //    the whole loop (even if the loop is fast, it's quadratic).
        //    It's important for the loop break condition (see below) to
        //    check this limit even between two read-only instructions.
        if (DistToSrc >= MaxMemDepDistance ||
            ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
             (NumAliased >= AliasedCheckLimit ||
              SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {

          // We increment the counter only if the locations are aliased
          // (instead of counting all alias checks). This gives a better
          // balance between reduced runtime and accurate dependencies.
          NumAliased++;

          DepDest->MemoryDependencies.push_back(BundleMember);
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = DepDest->FirstInBundle;
          if (!DestBundle->IsScheduled) {
            BundleMember->incrementUnscheduledDeps(1);
          }
          if (!DestBundle->hasValidDependencies()) {
            WorkList.push_back(DestBundle);
          }
        }

        // Example, explaining the loop break condition: Let's assume our
        // starting instruction is i0 and MaxMemDepDistance = 3.
        //
        //                      +--------v--v--v
        //             i0,i1,i2,i3,i4,i5,i6,i7,i8
        //             +--------^--^--^
        //
        // MaxMemDepDistance let us stop alias-checking at i3 and we add
        // dependencies from i0 to i3,i4,.. (even if they are not aliased).
        // Previously we already added dependencies from i3 to i6,i7,i8
        // (because of MaxMemDepDistance). As we added a dependency from
        // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
        // and we can abort this loop at i6.
        if (DistToSrc >= 2 * MaxMemDepDistance)
          break;
        DistToSrc++;
      }
    }
    if (InsertInReadyList && SD->isReady()) {
      ReadyInsts.insert(SD);
      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
                        << "\n");
    }
  }
}
16305
16306void BoUpSLP::BlockScheduling::resetSchedule() {
16307 assert(ScheduleStart &&
16308 "tried to reset schedule on block which has not been scheduled");
16309 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
16310 if (ScheduleData *SD = getScheduleData(I)) {
16311 assert(isInSchedulingRegion(SD) &&
16312 "ScheduleData not in scheduling region");
16313 SD->IsScheduled = false;
16314 SD->resetUnscheduledDeps();
16315 }
16316 }
16317 ReadyInsts.clear();
16318}
16319
/// Performs the "real" scheduling of the region in \p BS: resets the schedule,
/// recomputes dependencies for every vectorized bundle, then repeatedly picks
/// the ready ScheduleData with the best SchedulingPriority and moves its
/// instructions into place. Clears BS->ScheduleStart afterwards so the block
/// is not scheduled twice.
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  // No scheduling region was formed for this block - nothing to do.
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      TreeEntry *SDTE = getTreeEntry(SD->Inst);
      (void)SDTE;
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      // Only bundles belonging to the vectorizable tree need their
      // dependencies recomputed here.
      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
        PickedInst->moveAfter(LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
16402
  // NOTE(review): the opening line of the enclosing definition is not visible
  // in this excerpt; judging by the InstrElementSize cache it fills and the
  // `unsigned` it returns, this appears to be the element-size query
  // (bit width of the value V) — confirm against the full file.
  //
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // For an insertelement, the width is that of the inserted scalar operand.
  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  // Return a previously memoized result, if available.
  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent(), 0);
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      continue;
    // Remember the first non-boolean value seen; used as fallback below.
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent(), Level + 1);
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  // Memoize the result for every visited instruction.
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
16485
/// Checks whether the scalars of tree entry \p E (and, recursively, its
/// operand entries) can be demoted to an integer type of \p BitWidth bits.
/// On success the indices of demotable entries are appended to ToDemote,
/// \p BitWidth may be raised to the minimal width that is still provably
/// safe, and \p MaxDepthLevel reports the depth of the analyzed subtree.
/// Returns false when demotion is not possible/profitable.
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
    bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(E.Scalars, IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
  // Already at the target width - nothing to narrow.
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // If the value is not a vectorized instruction in the expression and not used
  // by the insertelement instruction and not used in multiple vector nodes, it
  // cannot be demoted.
  bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
    return !isKnownNonNegative(R, SimplifyQuery(*DL));
  });
  // Returns true if V fits in BitWidth bits; may grow BitWidth (by ref) to
  // the smallest width this value actually needs.
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (MultiNodeScalars.contains(V))
      return false;
    // For lat shuffle of sext/zext with many uses need to check the extra bit
    // for unsigned values, otherwise may have incorrect casting for reused
    // scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
      if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
        return true;
    }
    unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(V)) {
      // Refine with demanded-bits info where available.
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(BitWidth1, BitWidth2);
    }
    BitWidth = std::max(BitWidth, BitWidth1);
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  using namespace std::placeholders;
  // Last-chance profitability check for the whole entry; also records
  // demotable gather entries.
  auto FinalAnalysis = [&]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(V);
        if (!EE)
          continue;
        UniqueBases.insert(EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      if (UniqueBases.size() <= 2 ||
          TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
                  IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
        ToDemote.push_back(E.Idx);
    }
    return Res;
  };
  if (E.isGather() || !Visited.insert(&E).second ||
      any_of(E.Scalars, [&](Value *V) {
        return all_of(V->users(), [&](User *U) {
          return isa<InsertElementInst>(U) && !getTreeEntry(U);
        });
      }))
    return FinalAnalysis();

  if (any_of(E.Scalars, [&](Value *V) {
        return !all_of(V->users(), [=](User *U) {
          return getTreeEntry(U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(U)) ||
                 (!isa<CmpInst>(U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  // Recurses into the given operand entries; sets NeedToExit when recursion
  // stopped early but the current result is still usable.
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, Level, IsProfitableToDemote,
                                 IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(MaxDepthLevel, Level);
    }
    return true;
  };
  // Searches for the smallest power-of-2 BitWidth accepted by Checker.
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidth < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  // Common driver for each opcode case below: validates the bitwidth via
  // Checker, recurses into Operands, and records E as demotable on success.
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                              std::ref(BitWidth)));
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (E.UserTreeIndices.size() > 1 &&
              !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
                                           std::ref(BitWidth))))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(E.Idx);
        return IsProfitableToDemote;
      };
  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
  }
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
  }

  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since we ensure single use above.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    transform(seq<unsigned>(0, NumOps), Ops.begin(),
              std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
    if (!IC)
      break;
    // Only abs/smin/smax/umin/umax are handled.
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    // Checks that both min/max operands still compare equally after
    // truncation to BitWidth (zero high bits for unsigned, matching sign
    // bits for signed variants).
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
                                                  nullptr, DT);
        unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
                                                  nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(BitWidth);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
      auto VecCallCosts = getVectorCallCosts(
          IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
16840
16841static RecurKind getRdxKind(Value *V);
16842
  // Minimum-bitwidth analysis: walk the vectorizable tree and try to prove
  // that parts of the expression can be evaluated in a narrower integer type,
  // recording the chosen widths in MinBWs (per tree entry) and
  // ReductionBitWidth so later codegen can emit truncated vector operations
  // plus a final extend.
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->getOpcode() == Instruction::Store ||
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;

  unsigned NodeIdx = 0;
  if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
    NodeIdx = 1;

  // Ensure the roots of the vectorizable tree don't form a cycle.
  if (VectorizableTree[NodeIdx]->isGather() ||
      (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
      (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                              [NodeIdx](const EdgeInfo &EI) {
                                return EI.UserTE->Idx >
                                       static_cast<int>(NodeIdx);
                              })))
    return;

  // The first value node for store/insertelement is sext/zext/trunc? Skip it,
  // resize to the final type.
  bool IsTruncRoot = false;
  bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
  SmallVector<unsigned> RootDemotes;
  if (NodeIdx != 0 &&
      VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
      VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
    IsTruncRoot = true;
    RootDemotes.push_back(NodeIdx);
    IsProfitableToDemoteRoot = true;
    ++NodeIdx;
  }

  // Analyzed the reduction already and not profitable - exit.
  if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
    return;

  SmallVector<unsigned> ToDemote;
  // Returns the minimal bit width (before rounding, capped below at 8 for
  // widths 2..7) that entry \p E can be computed in, or 0 if the entry cannot
  // (or should not) be demoted. Fills ToDemote with the indices of the
  // demotable tree entries.
  auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
                                bool IsProfitableToDemoteRoot, unsigned Opcode,
                                unsigned Limit, bool IsTruncRoot,
                                bool IsSignedCmp) -> unsigned {
    ToDemote.clear();
    // Check if the root is trunc and the next node is gather/buildvector, then
    // keep trunc in scalars, which is free in most cases.
    if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
        E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
        all_of(E.Scalars, [&](Value *V) {
          return V->hasOneUse() || isa<Constant>(V) ||
                 (!V->hasNUsesOrMore(UsesLimit) &&
                  none_of(V->users(), [&](User *U) {
                    const TreeEntry *TE = getTreeEntry(U);
                    const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
                    if (TE == UserTE || !TE)
                      return false;
                    if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(U) ||
                        !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
                             SelectInst>(UserTE->getMainOp()))
                      return true;
                    unsigned UserTESz = DL->getTypeSizeInBits(
                        UserTE->Scalars.front()->getType());
                    auto It = MinBWs.find(TE);
                    if (It != MinBWs.end() && It->second.first > UserTESz)
                      return true;
                    return DL->getTypeSizeInBits(U->getType()) > UserTESz;
                  }));
        })) {
      // The gather node inherits the width of its (single) user entry.
      ToDemote.push_back(E.Idx);
      const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
      auto It = MinBWs.find(UserTE);
      if (It != MinBWs.end())
        return It->second.first;
      unsigned MaxBitWidth =
          DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
      MaxBitWidth = bit_ceil(MaxBitWidth);
      if (MaxBitWidth < 8 && MaxBitWidth > 1)
        MaxBitWidth = 8;
      return MaxBitWidth;
    }

    unsigned VF = E.getVectorFactor();
    Type *ScalarTy = E.Scalars.front()->getType();
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
    if (!TreeRootIT || !Opcode)
      return 0u;

    if (any_of(E.Scalars,
               [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
      return 0u;

    unsigned NumParts = TTI->getNumberOfParts(
        getWidenedType(TreeRootIT, VF * ScalarTyNumElements));

    // The maximum bit width required to represent all the values that can be
    // demoted without loss of precision. It would be safe to truncate the roots
    // of the expression to this width.
    unsigned MaxBitWidth = 1u;

    // True if the roots can be zero-extended back to their original type,
    // rather than sign-extended. We know that if the leading bits are not
    // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
    // True.
    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // We first check if all the bits of the roots are demanded. If they're not,
    // we can truncate the roots to this narrower type.
    for (Value *Root : E.Scalars) {
      unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
      TypeSize NumTypeBits =
          DL->getTypeSizeInBits(Root->getType()->getScalarType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      // If we can't prove that the sign bit is zero, we must add one to the
      // maximum bit width to account for the unknown sign bit. This preserves
      // the existing sign bit so we can safely sign-extend the root back to the
      // original type. Otherwise, if we know the sign bit is zero, we will
      // zero-extend the root instead.
      //
      // FIXME: This is somewhat suboptimal, as there will be cases where adding
      //        one to the maximum bit width will yield a larger-than-necessary
      //        type. In general, we need to add an extra bit only if we can't
      //        prove that the upper bit of the original type is equal to the
      //        upper bit of the proposed smaller type. If these two bits are
      //        the same (either zero or one) we know that sign-extending from
      //        the smaller type will result in the same value. Here, since we
      //        can't yet prove this, we are just making the proposed smaller
      //        type larger to ensure correctness.
      if (!IsKnownPositive)
        ++BitWidth1;

      APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
      unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      MaxBitWidth =
          std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
    }

    if (MaxBitWidth < 8 && MaxBitWidth > 1)
      MaxBitWidth = 8;

    // If the original type is large, but reduced type does not improve the reg
    // use - ignore it.
    if (NumParts > 1 &&
        NumParts ==
                IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
      return 0u;

    bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
                                Opcode == Instruction::SExt ||
                                Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the values that can be demoted in ToDemote and
    // additional roots that require investigating in Roots.
    unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
    bool NeedToDemote = IsProfitableToDemote;

    if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
                               ToDemote, Visited, MaxDepthLevel, NeedToDemote,
                               IsTruncRoot) ||
        (MaxDepthLevel <= Limit &&
         !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
            (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
             DL->getTypeSizeInBits(TreeRootIT) /
                     DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
                                               ->getOperand(0)
                                               ->getType()) >
                 2)))))
      return 0u;
    // Round MaxBitWidth up to the next power-of-two.
    MaxBitWidth = bit_ceil(MaxBitWidth);

    return MaxBitWidth;
  };

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  // Add reduction ops sizes, if any.
  if (UserIgnoreList &&
      isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
    // Seed ReductionBitWidth from the sign/demanded bits of the reduction
    // operations themselves.
    for (Value *V : *UserIgnoreList) {
      auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
      unsigned BitWidth1 = NumTypeBits - NumSignBits;
      if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
        ++BitWidth1;
      unsigned BitWidth2 = BitWidth1;
        auto Mask = DB->getDemandedBits(cast<Instruction>(V));
        BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
      }
      ReductionBitWidth =
          std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
    }
    if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
      ReductionBitWidth = 8;

    ReductionBitWidth = bit_ceil(ReductionBitWidth);
  }
  bool IsTopRoot = NodeIdx == 0;
  // Skip over a leading run of trunc entries; they are demoted implicitly
  // together with the nodes they feed.
  while (NodeIdx < VectorizableTree.size() &&
         VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
         VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
    RootDemotes.push_back(NodeIdx);
    ++NodeIdx;
    IsTruncRoot = true;
  }
  bool IsSignedCmp = false;
  while (NodeIdx < VectorizableTree.size()) {
    ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
    unsigned Limit = 2;
    unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
    if (IsTopRoot &&
        ReductionBitWidth ==
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
      Limit = 3;
    unsigned MaxBitWidth = ComputeMaxBitWidth(
        *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
        Limit, IsTruncRoot, IsSignedCmp);
    if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
      if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
        ReductionBitWidth = bit_ceil(MaxBitWidth);
      else if (MaxBitWidth == 0)
        ReductionBitWidth = 0;
    }

    // Demote the skipped trunc roots whose high bits (above MaxBitWidth) are
    // provably zero.
    for (unsigned Idx : RootDemotes) {
      if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
            uint32_t OrigBitWidth =
                DL->getTypeSizeInBits(V->getType()->getScalarType());
            if (OrigBitWidth > MaxBitWidth) {
              APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
              return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
            }
            return false;
          }))
        ToDemote.push_back(Idx);
    }
    RootDemotes.clear();
    IsTopRoot = false;
    IsProfitableToDemoteRoot = true;

    // Advance to the next unprocessed extra-bitwidth node (if any).
    if (ExtraBitWidthNodes.empty()) {
      NodeIdx = VectorizableTree.size();
    } else {
      unsigned NewIdx = 0;
      do {
        NewIdx = *ExtraBitWidthNodes.begin();
        ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
      } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
      NodeIdx = NewIdx;
      IsTruncRoot =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [](const EdgeInfo &EI) {
                   return EI.EdgeIdx == 0 &&
                          EI.UserTE->getOpcode() == Instruction::Trunc &&
                          !EI.UserTE->isAltShuffle();
                 });
      IsSignedCmp =
          NodeIdx < VectorizableTree.size() &&
          any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
                 [&](const EdgeInfo &EI) {
                   return EI.UserTE->getOpcode() == Instruction::ICmp &&
                          any_of(EI.UserTE->Scalars, [&](Value *V) {
                            auto *IC = dyn_cast<ICmpInst>(V);
                            return IC &&
                                   (IC->isSigned() ||
                                    !isKnownNonNegative(IC->getOperand(0),
                                                        SimplifyQuery(*DL)) ||
                                    !isKnownNonNegative(IC->getOperand(1),
                                                        SimplifyQuery(*DL)));
                          });
                 });
    }

    // If the maximum bit width we compute is less than the width of the roots'
    // type, we can proceed with the narrowing. Otherwise, do nothing.
    if (MaxBitWidth == 0 ||
        MaxBitWidth >=
            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
                ->getBitWidth()) {
      if (UserIgnoreList)
        AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
      continue;
    }

    // Finally, map the values we can demote to the maximum bit width we
    // computed.
    for (unsigned Idx : ToDemote) {
      TreeEntry *TE = VectorizableTree[Idx].get();
      if (MinBWs.contains(TE))
        continue;
      bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
      MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
    }
  }
}
17158
  // Fetch every analysis the vectorizer consumes (TLI is only looked up in
  // the cache) and forward them to runImpl().
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);

  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  // Nothing was vectorized - every analysis remains valid.
  if (!Changed)
    return PreservedAnalyses::all();

  return PA;
}
17178
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AAResults *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
    return false;
  // Cache the per-run analysis pointers in members so the helper methods
  // (collectSeedInstructions, vectorizeStoreChains, ...) can reach them
  // without threading arguments through every call.
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getDataLayout();

  // Reset per-function seed collections.
  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
    LLVM_DEBUG(
        dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
    return false;
  }

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Update DFS numbers now so that we can use them for ordering.
  DT->updateDFSNumbers();

  // Scan the blocks in the function in post order.
  for (auto *BB : post_order(&F.getEntryBlock())) {
    // Start new block - clear the list of reduction roots.
    R.clearReductionData();
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                        << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                        << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    // Deduplicate the gather/shuffle sequences emitted while vectorizing.
    R.optimizeGatherSequence();
    LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  }
  return Changed;
}
17257
// Analyze the chain of consecutive stores starting at offset Idx and
// vectorize it if profitable. Returns true when the chain was vectorized,
// false when it was analyzed and rejected (Size is then set for the caller's
// profitability heuristics), and std::nullopt when the seed store was
// gathered or its value could not be scheduled, so retrying the same slice
// is pointless (the caller records it in NonSchedulable).
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = Chain.size();

  if (!has_single_bit(Sz) || !has_single_bit(VF) || VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
    // all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(cast<StoreInst>(V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
  if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsPowerOf2 =
        has_single_bit(ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
    if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
         (!S.MainOp->isSafeToRemove() ||
          any_of(ValOps.getArrayRef(),
                 [&](Value *V) {
                   return !isa<ExtractElementInst>(V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(V->users(), [&](User *U) {
                             return !Stores.contains(U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
      Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
      return false;
    }
  }
  // Chains that look like a load-combine pattern are reported as handled;
  // presumably they are better served by other transforms - see
  // isLoadCombineCandidate (TODO confirm against its definition).
  if (R.isLoadCombineCandidate(Chain))
    return true;
  R.buildTree(Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    if (R.isGathered(Chain.front()) ||
        R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();

  Size = R.getCanonicalGraphSize();
  if (S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost();

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
17345
17346/// Checks if the quadratic mean deviation is less than 90% of the mean size.
17347static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
17348 bool First) {
17349 unsigned Num = 0;
17350 uint64_t Sum = std::accumulate(
17351 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
17352 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
17353 unsigned Size = First ? Val.first : Val.second;
17354 if (Size == 1)
17355 return V;
17356 ++Num;
17357 return V + Size;
17358 });
17359 if (Num == 0)
17360 return true;
17361 uint64_t Mean = Sum / Num;
17362 if (Mean == 0)
17363 return true;
17364 uint64_t Dev = std::accumulate(
17365 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
17366 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
17367 unsigned P = First ? Val.first : Val.second;
17368 if (P == 1)
17369 return V;
17370 return V + (P - Mean) * (P - Mean);
17371 }) /
17372 Num;
17373 return Dev * 81 / (Mean * Mean) == 0;
17374}
17375
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Orders (store index, distance) pairs by the distance of the store address
  // from the base address, so consecutive stores appear in address order.
  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
  // Tries to vectorize every maximal run of address-consecutive (distance
  // delta of exactly 1) stores in \p Set, retrying with adjusted vector
  // factors until no more profitable graphs are found.
  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
    int PrevDist = -1;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(Set)) {
      if (Operands.empty() || Data.second - PrevDist == 1) {
        Operands.push_back(Stores[Data.first]);
        PrevDist = Data.second;
        if (Idx != Set.size() - 1)
          continue;
      }
      // On leaving this iteration, restart the chain from the current store.
      auto E = make_scope_exit([&, &DataVar = Data]() {
        Operands.clear();
        Operands.push_back(Stores[DataVar.first]);
        PrevDist = DataVar.second;
      });

      if (Operands.size() <= 1 ||
          !Visited
               .insert({Operands.front(),
                        cast<StoreInst>(Operands.front())->getValueOperand(),
                        Operands.back(),
                        cast<StoreInst>(Operands.back())->getValueOperand(),
                        Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(Operands[0]);
      unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
      unsigned MaxRegVF = MaxVF;
      auto *Store = cast<StoreInst>(Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      if (ValueTy == StoreTy &&
          R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
        MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
      unsigned MinVF = std::max<unsigned>(
              R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
              ValueTy)));

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF =
            std::clamp<unsigned>(Operands.size(), MaxVF, MaxRegVF);
        if (has_single_bit(CandVF + 1))
          NonPowerOf2VF = CandVF;
      }

      // Candidate VFs: every power of two from MinVF up to MaxVF, largest
      // first, optionally followed by the almost-power-of-2 VF above.
      unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
      SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
      unsigned Size = MinVF;
      for_each(reverse(CandidateVFs), [&](unsigned &VF) {
        VF = Size > MaxVF ? NonPowerOf2VF : Size;
        Size *= 2;
      });
      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
        P.first = P.second = 1;
      });
      // Per-store status predicates; "First" selects the small-VF (< MaxRegVF)
      // or large-VF component of the recorded (first, second) size pair.
      auto IsNotVectorized = [](bool First,
                                const std::pair<unsigned, unsigned> &P) {
        return First ? P.first > 0 : P.second > 0;
      };
      auto IsVectorized = [](bool First,
                             const std::pair<unsigned, unsigned> &P) {
        return First ? P.first == 0 : P.second == 0;
      };
      auto VFIsProfitable = [](bool First, unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return First ? Size >= P.first : Size >= P.second;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned Size : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned StartIdx = std::distance(
              RangeSizes.begin(),
              find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
                                            std::placeholders::_1)));
          while (StartIdx < End) {
            unsigned EndIdx =
                std::distance(RangeSizes.begin(),
                              find_if(RangeSizes.drop_front(StartIdx),
                                      std::bind(IsVectorized, Size >= MaxRegVF,
                                                std::placeholders::_1)));
            unsigned Sz = EndIdx >= End ? End : EndIdx;
            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
                                  Size >= MaxRegVF)) {
                ++Cnt;
                continue;
              }
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
                  // Known to be unschedulable at this size - skip ahead.
                  Cnt += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
              if (!Res) {
                // Scheduling failed - remember the size so identical slices
                // are skipped on later attempts.
                NonSchedulable
                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
                    .first->getSecond()
                    .second = Size;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert(Slice.begin(), Slice.end());
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized initial block, no need to try to vectorize
                // it again.
                for_each(RangeSizes.slice(Cnt, Size),
                         [](std::pair<unsigned, unsigned> &P) {
                           P.first = P.second = 0;
                         });
                if (Cnt < StartIdx + MinVF) {
                  for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  StartIdx = Cnt + Size;
                }
                if (Cnt > Sz - Size - MinVF) {
                  for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
                           [](std::pair<unsigned, unsigned> &P) {
                             P.first = P.second = 0;
                           });
                  if (Sz == End)
                    End = Cnt;
                  Sz = Cnt;
                }
                Cnt += Size;
                continue;
              }
              if (Size > 2 && Res &&
                  !all_of(RangeSizes.slice(Cnt, Size),
                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
                                    std::placeholders::_1))) {
                Cnt += Size;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding same
              // trees, just with larger number of elements.
              if (Size > MaxRegVF && TreeSize > 1 &&
                  all_of(RangeSizes.slice(Cnt, Size),
                         std::bind(FirstSizeSame, TreeSize,
                                   std::placeholders::_1))) {
                Cnt += Size;
                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
                  ++Cnt;
                continue;
              }
              if (TreeSize > 1)
                for_each(RangeSizes.slice(Cnt, Size),
                         [&](std::pair<unsigned, unsigned> &P) {
                           if (Size >= MaxRegVF)
                             P.second = std::max(P.second, TreeSize);
                           else
                             P.first = std::max(P.first, TreeSize);
                         });
              ++Cnt;
              AnyProfitableGraph = true;
            }
            if (StartIdx >= End)
              break;
            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
              AnyProfitableGraph = true;
            StartIdx = std::distance(
                RangeSizes.begin(),
                find_if(RangeSizes.drop_front(Sz),
                        std::bind(IsNotVectorized, Size >= MaxRegVF,
                                  std::placeholders::_1)));
          }
          if (!AnyProfitableGraph && Size >= MaxRegVF)
            break;
        }
        // All values vectorized - exit.
        if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
              return P.first == 0 && P.second == 0;
            }))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
            Operands.size(),
            static_cast<unsigned>(
                End -
                std::distance(
                    RangeSizes.begin(),
                    find_if(RangeSizes, std::bind(IsNotVectorized, true,
                                                  std::placeholders::_1))) +
                1)));
        unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
          if (P.first != 0)
            P.first = std::max(P.second, P.first);
        });
        // Last attempt to vectorize max number of elements, if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        CandidateVFs.push_back(VF);
      }
    }
  };

  // Stores pair (first: index of the store into Stores array ref, address of
  // which taken as base, second: sorted set of pairs {index, dist}, which are
  // indices of stores in the set and their store location distances relative to
  // the base address).

  // Need to store the index of the very first store separately, since the set
  // may be reordered after the insertion and the first store may be moved. This
  // container allows to reduce number of calls of getPointersDiff() function.
  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If the store with the same distance is found already - stop
  // insertion, try to vectorize already found stores. If some stores from this
  // sequence were not vectorized - try to vectorize them with the new store
  // later. But this logic is applied only to the stores, that come before the
  // previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to first store. The very first bunch of stores is
  // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
  // vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  // the store #4.
  // - Try to vectorize sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in the reversed order, rather they are used in
  // the IR (Stores are reversed already, see vectorizeStoreChains() function).
  // Store #3 can be added -> comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - comes before store #4.
  // This logic allows to improve the compile time, we assume that the stores
  // after previous store with the same distance most likely have memory
  // dependencies and no need to waste compile time to try to vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
      std::optional<int> Diff = getPointersDiff(
          Stores[Set.first]->getValueOperand()->getType(),
          Stores[Set.first]->getPointerOperand(),
          SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
          /*StrictCheck=*/true);
      if (!Diff)
        continue;
      auto It = Set.second.find(std::make_pair(Idx, *Diff));
      if (It == Set.second.end()) {
        Set.second.emplace(Idx, *Diff);
        return;
      }
      // Try to vectorize the first found set to avoid duplicate analysis.
      TryToVectorize(Set.second);
      StoreIndexToDistSet PrevSet;
      PrevSet.swap(Set.second);
      Set.first = Idx;
      Set.second.emplace(Idx, 0);
      // Insert stores that followed previous match to try to vectorize them
      // with this store.
      unsigned StartIdx = It->first + 1;
      SmallBitVector UsedStores(Idx - StartIdx);
      // Distances to previously found dup store (or this store, since they
      // store to the same addresses).
      SmallVector<int> Dists(Idx - StartIdx, 0);
      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
        // Do not try to vectorize sequences, we already tried.
        if (Pair.first <= It->first ||
            VectorizedStores.contains(Stores[Pair.first]))
          break;
        unsigned BI = Pair.first - StartIdx;
        UsedStores.set(BI);
        Dists[BI] = Pair.second - It->second;
      }
      for (unsigned I = StartIdx; I < Idx; ++I) {
        unsigned BI = I - StartIdx;
        if (UsedStores.test(BI))
          Set.second.emplace(I, Dists[BI]);
      }
      return;
    }
    // No set with a compatible base was found - start a new sequence seeded
    // by this store.
    auto &Res = SortedStores.emplace_back();
    Res.first = Idx;
    Res.second.emplace(Idx, 0);
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(Stores)) {
    if (R.isDeleted(SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (auto &Set : SortedStores)
        TryToVectorize(Set.second);
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (auto &Set : SortedStores)
    TryToVectorize(Set.second);

  return Changed;
}
17759
17760void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
17761 // Initialize the collections. We will make a single pass over the block.
17762 Stores.clear();
17763 GEPs.clear();
17764
17765 // Visit the store and getelementptr instructions in BB and organize them in
17766 // Stores and GEPs according to the underlying objects of their pointer
17767 // operands.
17768 for (Instruction &I : *BB) {
17769 // Ignore store instructions that are volatile or have a pointer operand
17770 // that doesn't point to a scalar type.
17771 if (auto *SI = dyn_cast<StoreInst>(&I)) {
17772 if (!SI->isSimple())
17773 continue;
17774 if (!isValidElementType(SI->getValueOperand()->getType()))
17775 continue;
17776 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
17777 }
17778
17779 // Ignore getelementptr instructions that have more than one index, a
17780 // constant index, or a pointer operand that doesn't point to a scalar
17781 // type.
17782 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
17783 if (GEP->getNumIndices() != 1)
17784 continue;
17785 Value *Idx = GEP->idx_begin()->get();
17786 if (isa<Constant>(Idx))
17787 continue;
17788 if (!isValidElementType(Idx->getType()))
17789 continue;
17790 if (GEP->getType()->isVectorTy())
17791 continue;
17792 GEPs[GEP->getPointerOperand()].push_back(GEP);
17793 }
17794 }
17795}
17796
// Attempts to vectorize the homogeneous list of scalars \p VL, trying the
// maximum VF first and halving it until MinVF. Returns true if any bundle in
// the list was profitably vectorized. With \p MaxVFOnly set, only full-width
// bundles are attempted.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, *TLI);
  if (!S.getOpcode())
    return false;

  Instruction *I0 = cast<Instruction>(S.OpValue);
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    // NOTE(review): the guard condition opening this branch (original line
    // 17816, presumably an isValidElementType(Ty) check) is elided in this
    // rendering — confirm against the full source.
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit([&]() {
        std::string TypeStr;
        llvm::raw_string_ostream rso(TypeStr);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = R.getMinVF(Sz);
  // Largest power of two not exceeding the list length, but never below MinVF,
  // then clamped by the target's maximum VF for this element size/opcode.
  unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
  MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  // NOTE(review): a declaration (original line 17846, presumably
  // InstructionCost MinCost initialization) is elided in this rendering.
  Type *ScalarTy = getValueType(VL[0]);

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      // Bundle size is VF, or whatever remains at the tail of the list.
      unsigned ActualVF = std::min(MaxInst - I, VF);

      if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
        break;

      ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
      // Check that a previous iteration of this loop did not delete the Value.
      if (llvm::any_of(Ops, [&R](Value *V) {
            auto *I = dyn_cast<Instruction>(V);
            return I && R.isDeleted(I);
          }))
        continue;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      R.reorderTopToBottom();
      R.reorderBottomToTop(
          /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
          !R.doesRootHaveInTreeUses());
      R.transformNodes();
      R.buildExternalUses();

      R.computeMinimumValueSizes();
      InstructionCost Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      // Negative cost below the threshold means the vector form is cheaper.
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
17929
// Tries to vectorize a scalar binary-op/compare \p I by pairing its two
// operands (and, when profitable, one operand with a grand-operand, skipping
// a single-use intermediate op). Returns true on successful vectorization.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  // BasicBlock is a Value, so the parent can be held as a Value* here.
  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // First collect all possible candidates
  // NOTE(review): the declaration of Candidates (original line 17946) is
  // elided in this rendering.
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B: pair A with B's operands instead. B must be single-use so
  // skipping it does not leave extra scalar work behind.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P)
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P)
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A symmetrically.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P)
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P)
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
}
17980
17981namespace {
17982
17983/// Model horizontal reductions.
17984///
17985/// A horizontal reduction is a tree of reduction instructions that has values
17986/// that can be put into a vector as its leaves. For example:
17987///
17988/// mul mul mul mul
17989/// \ / \ /
17990/// + +
17991/// \ /
17992/// +
17993/// This tree has "mul" as its leaf values and "+" as its reduction
17994/// instructions. A reduction can feed into a store or a binary operation
17995/// feeding a phi.
17996/// ...
17997/// \ /
17998/// +
17999/// |
18000/// phi +=
18001///
18002/// Or:
18003/// ...
18004/// \ /
18005/// +
18006/// |
18007/// *p =
18008///
18009class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// The matched reduction operations. Per addReductionOps(): cmp+select
  /// min/max keeps compares in list 0 and selects in list 1; all other kinds
  /// use a single list.
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  // NOTE(review): the declaration following this comment (original line
  // 18014) is elided in this rendering.
  /// Maps reduced value to the corresponding reduction operation.
  // NOTE(review): the declaration following this comment (original line
  // 18016) is elided in this rendering.
  /// Weak tracking handle to the root instruction of the matched reduction.
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
18023
  /// Returns true if \p I is a min/max reduction expressed in the
  /// cmp + select form (select of a compare).
  // NOTE(review): the right-hand side of the '&&' (original line 18026) is
  // elided in this rendering.
  static bool isCmpSelMinMax(Instruction *I) {
    return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
  }
18028
18029 // And/or are potentially poison-safe logical patterns like:
18030 // select x, y, false
18031 // select x, true, y
18032 static bool isBoolLogicOp(Instruction *I) {
18033 return isa<SelectInst>(I) &&
18034 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
18035 }
18036
  /// Checks if instruction is associative and can be vectorized.
  static bool isVectorizable(RecurKind Kind, Instruction *I) {
    if (Kind == RecurKind::None)
      return false;

    // Integer ops that map to select instructions or intrinsics are fine.
    // NOTE(review): the first part of this condition (original line 18043)
    // is elided in this rendering.
        isBoolLogicOp(I))
      return true;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
      // specify a fixed result for it.
      return I->getFastMathFlags().noNaNs();
    }

    // FMaximum/FMinimum are accepted unconditionally — presumably because
    // these kinds fully define NaN/signed-zero behavior; confirm against
    // RecurrenceDescriptor docs.
    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return true;

    // Everything else falls back on the instruction's own associativity.
    return I->isAssociative();
  }
18059
18060 static Value *getRdxOperand(Instruction *I, unsigned Index) {
18061 // Poison-safe 'or' takes the form: select X, true, Y
18062 // To make that work with the normal operand processing, we skip the
18063 // true value operand.
18064 // TODO: Change the code and data structures to handle this without a hack.
18065 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
18066 return I->getOperand(2);
18067 return I->getOperand(Index);
18068 }
18069
18070 /// Creates reduction operation with the current opcode.
18071 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
18072 Value *RHS, const Twine &Name, bool UseSelect) {
18073 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
18074 switch (Kind) {
18075 case RecurKind::Or:
18076 if (UseSelect &&
18077 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
18078 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
18079 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
18080 Name);
18081 case RecurKind::And:
18082 if (UseSelect &&
18083 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
18084 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
18085 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
18086 Name);
18087 case RecurKind::Add:
18088 case RecurKind::Mul:
18089 case RecurKind::Xor:
18090 case RecurKind::FAdd:
18091 case RecurKind::FMul:
18092 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
18093 Name);
18094 case RecurKind::FMax:
18095 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
18096 case RecurKind::FMin:
18097 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
18098 case RecurKind::FMaximum:
18099 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
18100 case RecurKind::FMinimum:
18101 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
18102 case RecurKind::SMax:
18103 if (UseSelect) {
18104 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
18105 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
18106 }
18107 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
18108 case RecurKind::SMin:
18109 if (UseSelect) {
18110 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
18111 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
18112 }
18113 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
18114 case RecurKind::UMax:
18115 if (UseSelect) {
18116 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
18117 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
18118 }
18119 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
18120 case RecurKind::UMin:
18121 if (UseSelect) {
18122 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
18123 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
18124 }
18125 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
18126 default:
18127 llvm_unreachable("Unknown reduction operation.");
18128 }
18129 }
18130
  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    // Two op lists means cmp+select pairs were collected; a single list that
    // contains selects means a poison-safe logical and/or. Either way, emit
    // the select form.
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(ReductionOps.front(), IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
    // NOTE(review): the guard opening this branch (original line 18144) is
    // elided in this rendering.
      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
        // Propagate compare flags to the condition and select flags to the
        // select itself, in both cases dropping nuw/nsw.
        propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(Op, ReductionOps[1], nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
18156
18157public:
  /// Determines the reduction kind represented by \p V: a plain binary op,
  /// a min/max intrinsic, or a cmp+select min/max pattern. Returns
  /// RecurKind::None when \p V is not a recognized reduction operation.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    if (!I)
      return RecurKind::None;
    if (match(I, m_Add(m_Value(), m_Value())))
      return RecurKind::Add;
    if (match(I, m_Mul(m_Value(), m_Value())))
      return RecurKind::Mul;
    // NOTE(review): the second alternative of this match (original line
    // 18167) is elided in this rendering.
    if (match(I, m_And(m_Value(), m_Value())) ||
      return RecurKind::And;
    // NOTE(review): the second alternative of this match (original line
    // 18170) is elided in this rendering.
    if (match(I, m_Or(m_Value(), m_Value())) ||
      return RecurKind::Or;
    if (match(I, m_Xor(m_Value(), m_Value())))
      return RecurKind::Xor;
    if (match(I, m_FAdd(m_Value(), m_Value())))
      return RecurKind::FAdd;
    if (match(I, m_FMul(m_Value(), m_Value())))
      return RecurKind::FMul;

    // NOTE(review): the match conditions guarding the next four returns
    // (original lines 18179, 18181, 18184, 18186) are elided in this
    // rendering.
      return RecurKind::FMax;
      return RecurKind::FMin;

      return RecurKind::FMaximum;
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(I, m_SMax(m_Value(), m_Value())))
      return RecurKind::SMax;
    if (match(I, m_SMin(m_Value(), m_Value())))
      return RecurKind::SMin;
    if (match(I, m_UMax(m_Value(), m_Value())))
      return RecurKind::UMax;
    if (match(I, m_UMin(m_Value(), m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpInst::Predicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      // NOTE(review): several continuation lines of the following conditions
      // (original lines 18224, 18228, 18231, 18235) are elided in this
      // rendering.
      if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
        if (!isa<ExtractElementInst>(RHS) ||
          return RecurKind::None;
      } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
        if (!isa<ExtractElementInst>(LHS) ||
          return RecurKind::None;
      } else {
          return RecurKind::None;
        if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
            !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
          return RecurKind::None;
      }

      // Map the compare predicate to the corresponding integer min/max kind.
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
18258
18259 /// Get the index of the first operand.
18260 static unsigned getFirstOperandIndex(Instruction *I) {
18261 return isCmpSelMinMax(I) ? 1 : 0;
18262 }
18263
18264private:
18265 /// Total number of operands in the reduction operation.
18266 static unsigned getNumberOfOperands(Instruction *I) {
18267 return isCmpSelMinMax(I) ? 3 : 2;
18268 }
18269
18270 /// Checks if the instruction is in basic block \p BB.
18271 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
18272 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
18273 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
18274 auto *Sel = cast<SelectInst>(I);
18275 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
18276 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
18277 }
18278 return I->getParent() == BB;
18279 }
18280
18281 /// Expected number of uses for reduction operations/reduced values.
18282 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
18283 if (IsCmpSelMinMax) {
18284 // SelectInst must be used twice while the condition op must have single
18285 // use only.
18286 if (auto *Sel = dyn_cast<SelectInst>(I))
18287 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
18288 return I->hasNUses(2);
18289 }
18290
18291 // Arithmetic reduction operation must be used once only.
18292 return I->hasOneUse();
18293 }
18294
18295 /// Initializes the list of reduction operations.
18296 void initReductionOps(Instruction *I) {
18297 if (isCmpSelMinMax(I))
18298 ReductionOps.assign(2, ReductionOpsType());
18299 else
18300 ReductionOps.assign(1, ReductionOpsType());
18301 }
18302
18303 /// Add all reduction operations for the reduction instruction \p I.
18304 void addReductionOps(Instruction *I) {
18305 if (isCmpSelMinMax(I)) {
18306 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
18307 ReductionOps[1].emplace_back(I);
18308 } else {
18309 ReductionOps[0].emplace_back(I);
18310 }
18311 }
18312
18313 static bool isGoodForReduction(ArrayRef<Value *> Data) {
18314 int Sz = Data.size();
18315 auto *I = dyn_cast<Instruction>(Data.front());
18316 return Sz > 1 || isConstant(Data.front()) ||
18317 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
18318 }
18319
public:
  /// Constructs an empty matcher; populate it via matchAssociativeReduction().
  HorizontalReduction() = default;
18322
  /// Try to find a reduction tree.
  /// Walks the use-def chain upward from \p Root, collecting the chain of
  /// same-kind reduction operations into ReductionOps and everything else as
  /// reduced values, which are then regrouped and sorted into ReducedVals.
  /// Returns false if \p Root does not start a vectorizable reduction.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(Root);
    if (!isVectorizable(RdxKind, Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
    // NOTE(review): the worklist declaration line (original line 18349) is
    // elided in this rendering.
        1, std::make_pair(Root, 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
                                    getNumberOfOperands(TreeN)))) {
        Value *EdgeVal = getRdxOperand(TreeN, I);
        ReducedValsToOps[EdgeVal].push_back(TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst) ||
            (R.isAnalyzedReductionRoot(EdgeInst) &&
             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
    // NOTE(review): the map type lines of this declaration (original lines
    // 18385-18386) are elided in this rendering.
        8>
        PossibleReducedVals;
    initReductionOps(Root);
    // NOTE(review): the LoadsMap declaration line (original line 18390) is
    // elided in this rendering.
    SmallSet<size_t, 2> LoadKeyUsed;

    // Computes a secondary hash key for loads so that loads from compatible
    // pointers fall into the same reduced-value group.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(hash_value(LI->getParent()), Key);
      // NOTE(review): the Ptr initialization line (original line 18395) is
      // elided in this rendering.
      if (LoadKeyUsed.contains(Key)) {
        auto LIt = LoadsMap.find(Ptr);
        if (LIt != LoadsMap.end()) {
          // Prefer grouping with a load at a provable constant pointer
          // distance.
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                                LI->getType(), LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(RLI->getPointerOperand());
          }
          // NOTE(review): the condition line opening the inner 'if' below
          // (original line 18406) is elided in this rendering.
          for (LoadInst *RLI : LIt->second) {
              LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadKeyUsed.insert(Key);
      LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
      return hash_value(LI->getPointerOperand());
    };

    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
                                               /*AllowAlternate=*/false);
        // Count occurrences of each value within its (key, subkey) bucket.
        ++PossibleReducedVals[Key][Idx]
              .insert(std::make_pair(V, 0))
              .first->second;
      }
      // Ops in a different block restart the depth counter at their level + 1.
      for (Instruction *I : reverse(PossibleReductionOps))
        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
           It != E; ++It) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = It->second.takeVector();
        // Replicate each value by its occurrence count, most-frequent last.
        stable_sort(RedValsVect, llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(Data.second, Data.first);
      }
      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      int NewIdx = -1;
      // NOTE(review): two continuation lines of the condition below (original
      // lines 18466 and 18468) are elided in this rendering.
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (NewIdx < 0 ||
            (!isGoodForReduction(Data) &&
             (!isa<LoadInst>(Data.front()) ||
              !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
                  cast<LoadInst>(Data.front())->getPointerOperand()) !=
                  cast<LoadInst>(ReducedVals[NewIdx].front())
                      ->getPointerOperand())))) {
          NewIdx = ReducedVals.size();
          ReducedVals.emplace_back();
        }
        ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
      }
    }
    // Sort the reduced values by number of same/alternate opcode and/or pointer
    // operand.
    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
18484
18485 /// Attempt to vectorize the tree found by matchAssociativeReduction.
18486 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
18487 const TargetLibraryInfo &TLI) {
18488 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
18489 constexpr unsigned RegMaxNumber = 4;
18490 constexpr unsigned RedValsMaxNumber = 128;
18491 // If there are a sufficient number of reduction values, reduce
18492 // to a nearby power-of-2. We can safely generate oversized
18493 // vectors and rely on the backend to split them to legal sizes.
18494 if (unsigned NumReducedVals = std::accumulate(
18495 ReducedVals.begin(), ReducedVals.end(), 0,
18496 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
18497 if (!isGoodForReduction(Vals))
18498 return Num;
18499 return Num + Vals.size();
18500 });
18501 NumReducedVals < ReductionLimit &&
18502 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
18503 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
18504 })) {
18505 for (ReductionOpsType &RdxOps : ReductionOps)
18506 for (Value *RdxOp : RdxOps)
18507 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
18508 return nullptr;
18509 }
18510
18511 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
18512 TargetFolder(DL));
18513 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
18514
18515 // Track the reduced values in case if they are replaced by extractelement
18516 // because of the vectorization.
18517 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
18518 ReducedVals.front().size());
18519
18520 // The compare instruction of a min/max is the insertion point for new
18521 // instructions and may be replaced with a new compare instruction.
18522 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
18523 assert(isa<SelectInst>(RdxRootInst) &&
18524 "Expected min/max reduction to have select root instruction");
18525 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
18526 assert(isa<Instruction>(ScalarCond) &&
18527 "Expected min/max reduction to have compare condition");
18528 return cast<Instruction>(ScalarCond);
18529 };
18530
18531 // Return new VectorizedTree, based on previous value.
18532 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
18533 if (VectorizedTree) {
18534 // Update the final value in the reduction.
18535 Builder.SetCurrentDebugLocation(
18536 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
18537 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
18539 !isGuaranteedNotToBePoison(VectorizedTree))) {
18540 auto It = ReducedValsToOps.find(Res);
18541 if (It != ReducedValsToOps.end() &&
18542 any_of(It->getSecond(),
18543 [](Instruction *I) { return isBoolLogicOp(I); }))
18544 std::swap(VectorizedTree, Res);
18545 }
18546
18547 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
18548 ReductionOps);
18549 }
18550 // Initialize the final value in the reduction.
18551 return Res;
18552 };
18553 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
18554 return isBoolLogicOp(cast<Instruction>(V));
18555 });
18556 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
18557 ReductionOps.front().size());
18558 for (ReductionOpsType &RdxOps : ReductionOps)
18559 for (Value *RdxOp : RdxOps) {
18560 if (!RdxOp)
18561 continue;
18562 IgnoreList.insert(RdxOp);
18563 }
18564 // Intersect the fast-math-flags from all reduction operations.
18565 FastMathFlags RdxFMF;
18566 RdxFMF.set();
18567 for (Value *U : IgnoreList)
18568 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
18569 RdxFMF &= FPMO->getFastMathFlags();
18570 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
18571
18572 // Need to track reduced vals, they may be changed during vectorization of
18573 // subvectors.
18574 for (ArrayRef<Value *> Candidates : ReducedVals)
18575 for (Value *V : Candidates)
18576 TrackedVals.try_emplace(V, V);
18577
18579 Value *V) -> unsigned & {
18580 auto *It = MV.find(V);
18581 assert(It != MV.end() && "Unable to find given key.");
18582 return It->second;
18583 };
18584
18585 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
18586 // List of the values that were reduced in other trees as part of gather
18587 // nodes and thus requiring extract if fully vectorized in other trees.
18588 SmallPtrSet<Value *, 4> RequiredExtract;
18589 Value *VectorizedTree = nullptr;
18590 bool CheckForReusedReductionOps = false;
18591 // Try to vectorize elements based on their type.
18593 for (ArrayRef<Value *> RV : ReducedVals)
18594 States.push_back(getSameOpcode(RV, TLI));
18595 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
18596 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
18597 InstructionsState S = States[I];
18598 SmallVector<Value *> Candidates;
18599 Candidates.reserve(2 * OrigReducedVals.size());
18600 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
18601 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
18602 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
18603 // Check if the reduction value was not overridden by the extractelement
18604 // instruction because of the vectorization and exclude it, if it is not
18605 // compatible with other values.
18606 // Also check if the instruction was folded to constant/other value.
18607 auto *Inst = dyn_cast<Instruction>(RdxVal);
18608 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
18609 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
18610 (S.getOpcode() && !Inst))
18611 continue;
18612 Candidates.push_back(RdxVal);
18613 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
18614 }
18615 bool ShuffledExtracts = false;
18616 // Try to handle shuffled extractelements.
18617 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
18618 I + 1 < E) {
18619 SmallVector<Value *> CommonCandidates(Candidates);
18620 for (Value *RV : ReducedVals[I + 1]) {
18621 Value *RdxVal = TrackedVals.at(RV);
18622 // Check if the reduction value was not overridden by the
18623 // extractelement instruction because of the vectorization and
18624 // exclude it, if it is not compatible with other values.
18625 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
18626 if (!Inst)
18627 continue;
18628 CommonCandidates.push_back(RdxVal);
18629 TrackedToOrig.try_emplace(RdxVal, RV);
18630 }
18632 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
18633 ++I;
18634 Candidates.swap(CommonCandidates);
18635 ShuffledExtracts = true;
18636 }
18637 }
18638
18639 // Emit code for constant values.
18640 if (Candidates.size() > 1 && allConstant(Candidates)) {
18641 Value *Res = Candidates.front();
18642 Value *OrigV = TrackedToOrig.at(Candidates.front());
18643 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18644 for (Value *VC : ArrayRef(Candidates).drop_front()) {
18645 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
18646 Value *OrigV = TrackedToOrig.at(VC);
18647 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18648 if (auto *ResI = dyn_cast<Instruction>(Res))
18649 V.analyzedReductionRoot(ResI);
18650 }
18651 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
18652 continue;
18653 }
18654
18655 unsigned NumReducedVals = Candidates.size();
18656 if (NumReducedVals < ReductionLimit &&
18657 (NumReducedVals < 2 || !isSplat(Candidates)))
18658 continue;
18659
18660 // Check if we support repeated scalar values processing (optimization of
18661 // original scalar identity operations on matched horizontal reductions).
18662 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
18663 RdxKind != RecurKind::FMul &&
18664 RdxKind != RecurKind::FMulAdd;
18665 // Gather same values.
18666 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
18667 if (IsSupportedHorRdxIdentityOp)
18668 for (Value *V : Candidates) {
18669 Value *OrigV = TrackedToOrig.at(V);
18670 ++SameValuesCounter.try_emplace(OrigV).first->second;
18671 }
18672 // Used to check if the reduced values used same number of times. In this
18673 // case the compiler may produce better code. E.g. if reduced values are
18674 // aabbccdd (8 x values), then the first node of the tree will have a node
18675 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
18676 // Plus, the final reduction will be performed on <8 x aabbccdd>.
18677 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
18678 // x abcd) * 2.
18679 // Currently it only handles add/fadd/xor. and/or/min/max do not require
18680 // this analysis, other operations may require an extra estimation of
18681 // the profitability.
18682 bool SameScaleFactor = false;
18683 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
18684 SameValuesCounter.size() != Candidates.size();
18685 if (OptReusedScalars) {
18686 SameScaleFactor =
18687 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
18688 RdxKind == RecurKind::Xor) &&
18689 all_of(drop_begin(SameValuesCounter),
18690 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
18691 return P.second == SameValuesCounter.front().second;
18692 });
18693 Candidates.resize(SameValuesCounter.size());
18694 transform(SameValuesCounter, Candidates.begin(),
18695 [&](const auto &P) { return TrackedVals.at(P.first); });
18696 NumReducedVals = Candidates.size();
18697 // Have a reduction of the same element.
18698 if (NumReducedVals == 1) {
18699 Value *OrigV = TrackedToOrig.at(Candidates.front());
18700 unsigned Cnt = At(SameValuesCounter, OrigV);
18701 Value *RedVal =
18702 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
18703 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
18704 VectorizedVals.try_emplace(OrigV, Cnt);
18705 continue;
18706 }
18707 }
18708
18709 unsigned MaxVecRegSize = V.getMaxVecRegSize();
18710 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
18711 const unsigned MaxElts = std::clamp<unsigned>(
18712 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
18713 RegMaxNumber * RedValsMaxNumber);
18714
18715 unsigned ReduxWidth = NumReducedVals;
18716 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
18717 ReduxWidth = bit_floor(ReduxWidth);
18718 ReduxWidth = std::min(ReduxWidth, MaxElts);
18719
18720 unsigned Start = 0;
18721 unsigned Pos = Start;
18722 // Restarts vectorization attempt with lower vector factor.
18723 unsigned PrevReduxWidth = ReduxWidth;
18724 bool CheckForReusedReductionOpsLocal = false;
18725 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
18726 &CheckForReusedReductionOpsLocal,
18727 &PrevReduxWidth, &V,
18728 &IgnoreList](bool IgnoreVL = false) {
18729 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
18730 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
18731 // Check if any of the reduction ops are gathered. If so, worth
18732 // trying again with less number of reduction ops.
18733 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
18734 }
18735 ++Pos;
18736 if (Pos < NumReducedVals - ReduxWidth + 1)
18737 return IsAnyRedOpGathered;
18738 Pos = Start;
18739 ReduxWidth = bit_ceil(ReduxWidth) / 2;
18740 return IsAnyRedOpGathered;
18741 };
18742 bool AnyVectorized = false;
18743 while (Pos < NumReducedVals - ReduxWidth + 1 &&
18744 ReduxWidth >= ReductionLimit) {
18745 // Dependency in tree of the reduction ops - drop this attempt, try
18746 // later.
18747 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
18748 Start == 0) {
18749 CheckForReusedReductionOps = true;
18750 break;
18751 }
18752 PrevReduxWidth = ReduxWidth;
18753 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
18754 // Being analyzed already - skip.
18755 if (V.areAnalyzedReductionVals(VL)) {
18756 (void)AdjustReducedVals(/*IgnoreVL=*/true);
18757 continue;
18758 }
18759 // Early exit if any of the reduction values were deleted during
18760 // previous vectorization attempts.
18761 if (any_of(VL, [&V](Value *RedVal) {
18762 auto *RedValI = dyn_cast<Instruction>(RedVal);
18763 if (!RedValI)
18764 return false;
18765 return V.isDeleted(RedValI);
18766 }))
18767 break;
18768 V.buildTree(VL, IgnoreList);
18769 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
18770 if (!AdjustReducedVals())
18771 V.analyzedReductionVals(VL);
18772 continue;
18773 }
18774 if (V.isLoadCombineReductionCandidate(RdxKind)) {
18775 if (!AdjustReducedVals())
18776 V.analyzedReductionVals(VL);
18777 continue;
18778 }
18779 V.reorderTopToBottom();
18780 // No need to reorder the root node at all.
18781 V.reorderBottomToTop(/*IgnoreReorder=*/true);
18782 // Keep extracted other reduction values, if they are used in the
18783 // vectorization trees.
18784 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues;
18785 // The reduction root is used as the insertion point for new
18786 // instructions, so set it as externally used to prevent it from being
18787 // deleted.
18788 LocalExternallyUsedValues[ReductionRoot];
18789 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
18790 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
18791 continue;
18792 for (Value *V : ReducedVals[Cnt])
18793 if (isa<Instruction>(V))
18794 LocalExternallyUsedValues[TrackedVals[V]];
18795 }
18796 if (!IsSupportedHorRdxIdentityOp) {
18797 // Number of uses of the candidates in the vector of values.
18798 assert(SameValuesCounter.empty() &&
18799 "Reused values counter map is not empty");
18800 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
18801 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
18802 continue;
18803 Value *V = Candidates[Cnt];
18804 Value *OrigV = TrackedToOrig.at(V);
18805 ++SameValuesCounter.try_emplace(OrigV).first->second;
18806 }
18807 }
18808 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
18809 // Gather externally used values.
18811 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
18812 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
18813 continue;
18814 Value *RdxVal = Candidates[Cnt];
18815 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
18816 RdxVal = It->second;
18817 if (!Visited.insert(RdxVal).second)
18818 continue;
18819 // Check if the scalar was vectorized as part of the vectorization
18820 // tree but not the top node.
18821 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
18822 LocalExternallyUsedValues[RdxVal];
18823 continue;
18824 }
18825 Value *OrigV = TrackedToOrig.at(RdxVal);
18826 unsigned NumOps =
18827 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
18828 if (NumOps != ReducedValsToOps.at(OrigV).size())
18829 LocalExternallyUsedValues[RdxVal];
18830 }
18831 // Do not need the list of reused scalars in regular mode anymore.
18832 if (!IsSupportedHorRdxIdentityOp)
18833 SameValuesCounter.clear();
18834 for (Value *RdxVal : VL)
18835 if (RequiredExtract.contains(RdxVal))
18836 LocalExternallyUsedValues[RdxVal];
18837 V.transformNodes();
18838 V.buildExternalUses(LocalExternallyUsedValues);
18839
18840 V.computeMinimumValueSizes();
18841
18842 // Estimate cost.
18843 InstructionCost TreeCost = V.getTreeCost(VL);
18844 InstructionCost ReductionCost =
18845 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
18846 InstructionCost Cost = TreeCost + ReductionCost;
18847 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
18848 << " for reduction\n");
18849 if (!Cost.isValid())
18850 break;
18851 if (Cost >= -SLPCostThreshold) {
18852 V.getORE()->emit([&]() {
18853 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
18854 ReducedValsToOps.at(VL[0]).front())
18855 << "Vectorizing horizontal reduction is possible "
18856 << "but not beneficial with cost " << ore::NV("Cost", Cost)
18857 << " and threshold "
18858 << ore::NV("Threshold", -SLPCostThreshold);
18859 });
18860 if (!AdjustReducedVals())
18861 V.analyzedReductionVals(VL);
18862 continue;
18863 }
18864
18865 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
18866 << Cost << ". (HorRdx)\n");
18867 V.getORE()->emit([&]() {
18868 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
18869 ReducedValsToOps.at(VL[0]).front())
18870 << "Vectorized horizontal reduction with cost "
18871 << ore::NV("Cost", Cost) << " and with tree size "
18872 << ore::NV("TreeSize", V.getTreeSize());
18873 });
18874
18875 Builder.setFastMathFlags(RdxFMF);
18876
18877 // Emit a reduction. If the root is a select (min/max idiom), the insert
18878 // point is the compare condition of that select.
18879 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
18880 Instruction *InsertPt = RdxRootInst;
18881 if (IsCmpSelMinMax)
18882 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
18883
18884 // Vectorize a tree.
18885 Value *VectorizedRoot =
18886 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
18887 // Update TrackedToOrig mapping, since the tracked values might be
18888 // updated.
18889 for (Value *RdxVal : Candidates) {
18890 Value *OrigVal = TrackedToOrig.at(RdxVal);
18891 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
18892 if (TransformedRdxVal != RdxVal)
18893 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
18894 }
18895
18896 Builder.SetInsertPoint(InsertPt);
18897
18898 // To prevent poison from leaking across what used to be sequential,
18899 // safe, scalar boolean logic operations, the reduction operand must be
18900 // frozen.
18901 if ((isBoolLogicOp(RdxRootInst) ||
18902 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
18903 !isGuaranteedNotToBePoison(VectorizedRoot))
18904 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
18905
18906 // Emit code to correctly handle reused reduced values, if required.
18907 if (OptReusedScalars && !SameScaleFactor) {
18908 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
18909 SameValuesCounter, TrackedToOrig);
18910 }
18911
18912 Value *ReducedSubTree;
18913 Type *ScalarTy = VL.front()->getType();
18914 if (isa<FixedVectorType>(ScalarTy)) {
18915 assert(SLPReVec && "FixedVectorType is not expected.");
18916 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18917 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
18918 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
18919 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
18920 // Do reduction for each lane.
18921 // e.g., do reduce add for
18922 // VL[0] = <4 x Ty> <a, b, c, d>
18923 // VL[1] = <4 x Ty> <e, f, g, h>
18924 // Lane[0] = <2 x Ty> <a, e>
18925 // Lane[1] = <2 x Ty> <b, f>
18926 // Lane[2] = <2 x Ty> <c, g>
18927 // Lane[3] = <2 x Ty> <d, h>
18928 // result[0] = reduce add Lane[0]
18929 // result[1] = reduce add Lane[1]
18930 // result[2] = reduce add Lane[2]
18931 // result[3] = reduce add Lane[3]
18933 createStrideMask(I, ScalarTyNumElements, VL.size());
18934 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
18935 ReducedSubTree = Builder.CreateInsertElement(
18936 ReducedSubTree, emitReduction(Lane, Builder, TTI), I);
18937 }
18938 } else {
18939 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI);
18940 }
18941 if (ReducedSubTree->getType() != VL.front()->getType()) {
18942 assert(ReducedSubTree->getType() != VL.front()->getType() &&
18943 "Expected different reduction type.");
18944 ReducedSubTree =
18945 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
18946 V.isSignedMinBitwidthRootNode());
18947 }
18948
18949 // Improved analysis for add/fadd/xor reductions with same scale factor
18950 // for all operands of reductions. We can emit scalar ops for them
18951 // instead.
18952 if (OptReusedScalars && SameScaleFactor)
18953 ReducedSubTree = emitScaleForReusedOps(
18954 ReducedSubTree, Builder, SameValuesCounter.front().second);
18955
18956 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
18957 // Count vectorized reduced values to exclude them from final reduction.
18958 for (Value *RdxVal : VL) {
18959 Value *OrigV = TrackedToOrig.at(RdxVal);
18960 if (IsSupportedHorRdxIdentityOp) {
18961 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
18962 continue;
18963 }
18964 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
18965 if (!V.isVectorized(RdxVal))
18966 RequiredExtract.insert(RdxVal);
18967 }
18968 Pos += ReduxWidth;
18969 Start = Pos;
18970 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
18971 AnyVectorized = true;
18972 }
18973 if (OptReusedScalars && !AnyVectorized) {
18974 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
18975 Value *RdxVal = TrackedVals.at(P.first);
18976 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
18977 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
18978 VectorizedVals.try_emplace(P.first, P.second);
18979 }
18980 continue;
18981 }
18982 }
18983 if (VectorizedTree) {
18984 // Reorder operands of bool logical op in the natural order to avoid
18985 // possible problem with poison propagation. If not possible to reorder
18986 // (both operands are originally RHS), emit an extra freeze instruction
18987 // for the LHS operand.
18988 // I.e., if we have original code like this:
18989 // RedOp1 = select i1 ?, i1 LHS, i1 false
18990 // RedOp2 = select i1 RHS, i1 ?, i1 false
18991
18992 // Then, we swap LHS/RHS to create a new op that matches the poison
18993 // semantics of the original code.
18994
18995 // If we have original code like this and both values could be poison:
18996 // RedOp1 = select i1 ?, i1 LHS, i1 false
18997 // RedOp2 = select i1 ?, i1 RHS, i1 false
18998
18999 // Then, we must freeze LHS in the new op.
19000 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
19001 Instruction *RedOp1,
19002 Instruction *RedOp2,
19003 bool InitStep) {
19004 if (!AnyBoolLogicOp)
19005 return;
19006 if (isBoolLogicOp(RedOp1) &&
19007 ((!InitStep && LHS == VectorizedTree) ||
19008 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
19009 return;
19010 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
19011 getRdxOperand(RedOp2, 0) == RHS ||
19013 std::swap(LHS, RHS);
19014 return;
19015 }
19016 if (LHS != VectorizedTree)
19017 LHS = Builder.CreateFreeze(LHS);
19018 };
19019 // Finish the reduction.
19020 // Need to add extra arguments and not vectorized possible reduction
19021 // values.
19022 // Try to avoid dependencies between the scalar remainders after
19023 // reductions.
19024 auto FinalGen =
19026 bool InitStep) {
19027 unsigned Sz = InstVals.size();
19029 Sz % 2);
19030 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
19031 Instruction *RedOp = InstVals[I + 1].first;
19032 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
19033 Value *RdxVal1 = InstVals[I].second;
19034 Value *StableRdxVal1 = RdxVal1;
19035 auto It1 = TrackedVals.find(RdxVal1);
19036 if (It1 != TrackedVals.end())
19037 StableRdxVal1 = It1->second;
19038 Value *RdxVal2 = InstVals[I + 1].second;
19039 Value *StableRdxVal2 = RdxVal2;
19040 auto It2 = TrackedVals.find(RdxVal2);
19041 if (It2 != TrackedVals.end())
19042 StableRdxVal2 = It2->second;
19043 // To prevent poison from leaking across what used to be
19044 // sequential, safe, scalar boolean logic operations, the
19045 // reduction operand must be frozen.
19046 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
19047 RedOp, InitStep);
19048 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
19049 StableRdxVal2, "op.rdx", ReductionOps);
19050 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
19051 }
19052 if (Sz % 2 == 1)
19053 ExtraReds[Sz / 2] = InstVals.back();
19054 return ExtraReds;
19055 };
19057 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
19058 VectorizedTree);
19060 for (ArrayRef<Value *> Candidates : ReducedVals) {
19061 for (Value *RdxVal : Candidates) {
19062 if (!Visited.insert(RdxVal).second)
19063 continue;
19064 unsigned NumOps = VectorizedVals.lookup(RdxVal);
19065 for (Instruction *RedOp :
19066 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
19067 ExtraReductions.emplace_back(RedOp, RdxVal);
19068 }
19069 }
19070 // Iterate through all not-vectorized reduction values/extra arguments.
19071 bool InitStep = true;
19072 while (ExtraReductions.size() > 1) {
19074 FinalGen(ExtraReductions, InitStep);
19075 ExtraReductions.swap(NewReds);
19076 InitStep = false;
19077 }
19078 VectorizedTree = ExtraReductions.front().second;
19079
19080 ReductionRoot->replaceAllUsesWith(VectorizedTree);
19081
19082 // The original scalar reduction is expected to have no remaining
19083 // uses outside the reduction tree itself. Assert that we got this
19084 // correct, replace internal uses with undef, and mark for eventual
19085 // deletion.
19086#ifndef NDEBUG
19087 SmallSet<Value *, 4> IgnoreSet;
19088 for (ArrayRef<Value *> RdxOps : ReductionOps)
19089 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
19090#endif
19091 for (ArrayRef<Value *> RdxOps : ReductionOps) {
19092 for (Value *Ignore : RdxOps) {
19093 if (!Ignore)
19094 continue;
19095#ifndef NDEBUG
19096 for (auto *U : Ignore->users()) {
19097 assert(IgnoreSet.count(U) &&
19098 "All users must be either in the reduction ops list.");
19099 }
19100#endif
19101 if (!Ignore->use_empty()) {
19102 Value *P = PoisonValue::get(Ignore->getType());
19103 Ignore->replaceAllUsesWith(P);
19104 }
19105 }
19106 V.removeInstructionsAndOperands(RdxOps);
19107 }
19108 } else if (!CheckForReusedReductionOps) {
19109 for (ReductionOpsType &RdxOps : ReductionOps)
19110 for (Value *RdxOp : RdxOps)
19111 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19112 }
19113 return VectorizedTree;
19114 }
19115
19116private:
 19117 /// Calculate the cost of a reduction.
 ///
 /// Returns the vectorized reduction cost minus the cost of the scalar
 /// reduction operations it would replace, so a negative result means
 /// vectorization is profitable.
 ///
 /// \param TTI target cost model used for all cost queries.
 /// \param ReducedVals the scalar values participating in the reduction.
 /// \param IsCmpSelMinMax true for the cmp+select min/max idiom; it changes
 ///        how many uses each reduced value is expected to have.
 /// \param ReduxWidth number of lanes of the vectorized reduction.
 /// \param FMF fast-math flags forwarded to FP reduction cost queries.
 19118 InstructionCost getReductionCost(TargetTransformInfo *TTI,
 19119 ArrayRef<Value *> ReducedVals,
 19120 bool IsCmpSelMinMax, unsigned ReduxWidth,
 19121 FastMathFlags FMF) {
 // NOTE(review): the declaration of CostKind (original line 19122) is elided
 // in this excerpt; presumably TTI::TCK_RecipThroughput — confirm upstream.
 19123 Type *ScalarTy = ReducedVals.front()->getType();
 19124 FixedVectorType *VectorTy = getWidenedType(ScalarTy, ReduxWidth);
 19125 InstructionCost VectorCost = 0, ScalarCost;
 19126 // If all of the reduced values are constant, the vector cost is 0, since
 19127 // the reduction value can be calculated at the compile time.
 19128 bool AllConsts = allConstant(ReducedVals);
 // Sums the scalar-side cost: per reduced value, either the real cost of its
 // reduction-op users (when the use counts match the expected pattern) or the
 // generic per-op cost produced by \p GenCostFn.
 19129 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
 19130 InstructionCost Cost = 0;
 19131 // Scalar cost is repeated for N-1 elements.
 19132 int Cnt = ReducedVals.size();
 19133 for (Value *RdxVal : ReducedVals) {
 19134 if (Cnt == 1)
 19135 break;
 19136 --Cnt;
 // Values with extra uses cannot have their users costed precisely; fall
 // back to the generic cost.
 19137 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
 19138 Cost += GenCostFn();
 19139 continue;
 19140 }
 19141 InstructionCost ScalarCost = 0;
 19142 for (User *U : RdxVal->users()) {
 19143 auto *RdxOp = cast<Instruction>(U);
 19144 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
 19145 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
 19146 continue;
 19147 }
 // A user with an unexpected use count invalidates the precise estimate.
 19148 ScalarCost = InstructionCost::getInvalid();
 19149 break;
 19150 }
 19151 if (ScalarCost.isValid())
 19152 Cost += ScalarCost;
 19153 else
 19154 Cost += GenCostFn();
 19155 }
 19156 return Cost;
 19157 };
 19158 switch (RdxKind) {
 // Plain arithmetic/bitwise reductions map to an arithmetic-reduction cost.
 19159 case RecurKind::Add:
 19160 case RecurKind::Mul:
 19161 case RecurKind::Or:
 19162 case RecurKind::And:
 19163 case RecurKind::Xor:
 19164 case RecurKind::FAdd:
 19165 case RecurKind::FMul: {
 19166 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
 19167 if (!AllConsts) {
 // Revectorization (SLPReVec): the "scalars" are themselves fixed vectors,
 // so cost one strided shuffle + reduction per lane of the element type,
 // plus the cost of inserting the per-lane results.
 19168 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
 19169 assert(SLPReVec && "FixedVectorType is not expected.");
 19170 unsigned ScalarTyNumElements = VecTy->getNumElements();
 19171 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
 19172 VectorCost += TTI->getShuffleCost(
 19173 TTI::SK_PermuteSingleSrc, VectorTy,
 19174 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
 19175 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
 19176 CostKind);
 19177 }
 19178 VectorCost += TTI->getScalarizationOverhead(
 19179 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
 19180 /*Extract*/ false, TTI::TCK_RecipThroughput);
 19181 } else {
 19182 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
 19183 CostKind);
 19184 }
 19185 }
 19186 ScalarCost = EvaluateScalarCost([&]() {
 19187 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
 19188 });
 19189 break;
 19190 }
 // Min/max reductions are costed via the matching intrinsic.
 19191 case RecurKind::FMax:
 19192 case RecurKind::FMin:
 19193 case RecurKind::FMaximum:
 19194 case RecurKind::FMinimum:
 19195 case RecurKind::SMax:
 19196 case RecurKind::SMin:
 19197 case RecurKind::UMax:
 19198 case RecurKind::UMin: {
 // NOTE(review): the declaration of Id (original line 19199) is elided in
 // this excerpt; presumably the min/max intrinsic ID for RdxKind — confirm.
 19200 if (!AllConsts)
 19201 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
 19202 ScalarCost = EvaluateScalarCost([&]() {
 19203 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
 19204 return TTI->getIntrinsicInstrCost(ICA, CostKind);
 19205 });
 19206 break;
 19207 }
 19208 default:
 19209 llvm_unreachable("Expected arithmetic or min/max reduction operation");
 19210 }
 19211
 19212 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
 19213 << " for reduction of " << shortBundleName(ReducedVals)
 19214 << " (It is a splitting reduction)\n");
 19215 return VectorCost - ScalarCost;
 19216 }
19217
19218 /// Emit a horizontal reduction of the vectorized value.
19219 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
19220 const TargetTransformInfo *TTI) {
19221 assert(VectorizedValue && "Need to have a vectorized tree node");
19222 assert(RdxKind != RecurKind::FMulAdd &&
19223 "A call to the llvm.fmuladd intrinsic is not handled yet");
19224
19225 ++NumVectorInstructions;
19226 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
19227 }
19228
19229 /// Emits optimized code for unique scalar value reused \p Cnt times.
19230 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
19231 unsigned Cnt) {
19232 assert(IsSupportedHorRdxIdentityOp &&
19233 "The optimization of matched scalar identity horizontal reductions "
19234 "must be supported.");
19235 if (Cnt == 1)
19236 return VectorizedValue;
19237 switch (RdxKind) {
19238 case RecurKind::Add: {
19239 // res = mul vv, n
19240 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
19241 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
19242 << VectorizedValue << ". (HorRdx)\n");
19243 return Builder.CreateMul(VectorizedValue, Scale);
19244 }
19245 case RecurKind::Xor: {
19246 // res = n % 2 ? 0 : vv
19247 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
19248 << ". (HorRdx)\n");
19249 if (Cnt % 2 == 0)
19250 return Constant::getNullValue(VectorizedValue->getType());
19251 return VectorizedValue;
19252 }
19253 case RecurKind::FAdd: {
19254 // res = fmul v, n
19255 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
19256 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
19257 << VectorizedValue << ". (HorRdx)\n");
19258 return Builder.CreateFMul(VectorizedValue, Scale);
19259 }
19260 case RecurKind::And:
19261 case RecurKind::Or:
19262 case RecurKind::SMax:
19263 case RecurKind::SMin:
19264 case RecurKind::UMax:
19265 case RecurKind::UMin:
19266 case RecurKind::FMax:
19267 case RecurKind::FMin:
19268 case RecurKind::FMaximum:
19269 case RecurKind::FMinimum:
19270 // res = vv
19271 return VectorizedValue;
19272 case RecurKind::Mul:
19273 case RecurKind::FMul:
19274 case RecurKind::FMulAdd:
19275 case RecurKind::IAnyOf:
19276 case RecurKind::FAnyOf:
19277 case RecurKind::None:
19278 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
19279 }
19280 return nullptr;
19281 }
19282
 19283 /// Emits actual operation for the scalar identity values, found during
 19284 /// horizontal reduction analysis.
 ///
 /// \param VectorizedValue the vectorized root value to rescale.
 /// \param Builder IR builder used to emit the rescaling operations.
 /// \param R the vectorizer instance (source of the root node scalars).
 /// \param SameValuesCounter per-original-value repeat counts.
 /// \param TrackedToOrig maps tracked (possibly replaced) values back to the
 ///        original reduced values used as keys in \p SameValuesCounter.
 /// \returns the rescaled vector value.
 19285 Value *
 19286 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
 19287 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
 19288 const DenseMap<Value *, Value *> &TrackedToOrig) {
 19289 assert(IsSupportedHorRdxIdentityOp &&
 19290 "The optimization of matched scalar identity horizontal reductions "
 19291 "must be supported.");
 19292 ArrayRef<Value *> VL = R.getRootNodeScalars();
 19293 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
 // Widen/narrow the vector to the scalar element type first if minimum
 // bitwidth analysis shrunk the root node.
 19294 if (VTy->getElementType() != VL.front()->getType()) {
 19295 VectorizedValue = Builder.CreateIntCast(
 19296 VectorizedValue,
 19297 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
 19298 R.isSignedMinBitwidthRootNode());
 19299 }
 19300 switch (RdxKind) {
 19301 case RecurKind::Add: {
 19302 // root = mul prev_root, <1, 1, n, 1>
 // NOTE(review): the declaration of Vals (original line 19303) is elided in
 // this excerpt; presumably a SmallVector of Constant* — confirm upstream.
 19304 for (Value *V : VL) {
 19305 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
 19306 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
 19307 }
 19308 auto *Scale = ConstantVector::get(Vals);
 19309 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
 19310 << VectorizedValue << ". (HorRdx)\n");
 19311 return Builder.CreateMul(VectorizedValue, Scale);
 19312 }
 19313 case RecurKind::And:
 19314 case RecurKind::Or:
 19315 // No need for multiple or/and(s).
 19316 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
 19317 << ". (HorRdx)\n");
 19318 return VectorizedValue;
 19319 case RecurKind::SMax:
 19320 case RecurKind::SMin:
 19321 case RecurKind::UMax:
 19322 case RecurKind::UMin:
 19323 case RecurKind::FMax:
 19324 case RecurKind::FMin:
 19325 case RecurKind::FMaximum:
 19326 case RecurKind::FMinimum:
 19327 // No need for multiple min/max(s) of the same value.
 19328 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
 19329 << ". (HorRdx)\n");
 19330 return VectorizedValue;
 19331 case RecurKind::Xor: {
 19332 // Replace values with even number of repeats with 0, since
 19333 // x xor x = 0.
 19334 // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
 19335 // 7>, if elements 4th and 6th elements have even number of repeats.
 // NOTE(review): the declaration of Mask (original lines 19336/19338) is
 // elided in this excerpt; presumably a SmallVector<int> of VF elements.
 19337 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
 19339 std::iota(Mask.begin(), Mask.end(), 0);
 19340 bool NeedShuffle = false;
 // Lanes whose value repeats an even number of times are replaced by a zero
 // lane from the second shuffle operand (index VF).
 19341 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
 19342 Value *V = VL[I];
 19343 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
 19344 if (Cnt % 2 == 0) {
 19345 Mask[I] = VF;
 19346 NeedShuffle = true;
 19347 }
 19348 }
 19349 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
 19350 : Mask) dbgs()
 19351 << I << " ";
 19352 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
 19353 if (NeedShuffle)
 19354 VectorizedValue = Builder.CreateShuffleVector(
 19355 VectorizedValue,
 19356 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
 19357 return VectorizedValue;
 19358 }
 19359 case RecurKind::FAdd: {
 19360 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
 // NOTE(review): the declaration of Vals (original line 19361) is elided in
 // this excerpt; presumably a SmallVector of Constant* — confirm upstream.
 19362 for (Value *V : VL) {
 19363 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
 19364 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
 19365 }
 19366 auto *Scale = ConstantVector::get(Vals);
 19367 return Builder.CreateFMul(VectorizedValue, Scale);
 19368 }
 19369 case RecurKind::Mul:
 19370 case RecurKind::FMul:
 19371 case RecurKind::FMulAdd:
 19372 case RecurKind::IAnyOf:
 19373 case RecurKind::FAnyOf:
 19374 case RecurKind::None:
 19375 llvm_unreachable("Unexpected reduction kind for reused scalars.");
 19376 }
 19377 return nullptr;
 19378 }
19379};
19380} // end anonymous namespace
19381
19382/// Gets recurrence kind from the specified value.
19384 return HorizontalReduction::getRdxKind(V);
19385}
19386static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
19387 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
19388 return cast<FixedVectorType>(IE->getType())->getNumElements();
19389
19390 unsigned AggregateSize = 1;
19391 auto *IV = cast<InsertValueInst>(InsertInst);
19392 Type *CurrentType = IV->getType();
19393 do {
19394 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
19395 for (auto *Elt : ST->elements())
19396 if (Elt != ST->getElementType(0)) // check homogeneity
19397 return std::nullopt;
19398 AggregateSize *= ST->getNumElements();
19399 CurrentType = ST->getElementType(0);
19400 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
19401 AggregateSize *= AT->getNumElements();
19402 CurrentType = AT->getElementType();
19403 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
19404 AggregateSize *= VT->getNumElements();
19405 return AggregateSize;
19406 } else if (CurrentType->isSingleValueType()) {
19407 return AggregateSize;
19408 } else {
19409 return std::nullopt;
19410 }
19411 } while (true);
19412}
19413
19414static void findBuildAggregate_rec(Instruction *LastInsertInst,
19416 SmallVectorImpl<Value *> &BuildVectorOpds,
19417 SmallVectorImpl<Value *> &InsertElts,
19418 unsigned OperandOffset) {
19419 do {
19420 Value *InsertedOperand = LastInsertInst->getOperand(1);
19421 std::optional<unsigned> OperandIndex =
19422 getElementIndex(LastInsertInst, OperandOffset);
19423 if (!OperandIndex)
19424 return;
19425 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
19427 BuildVectorOpds, InsertElts, *OperandIndex);
19428
19429 } else {
19430 BuildVectorOpds[*OperandIndex] = InsertedOperand;
19431 InsertElts[*OperandIndex] = LastInsertInst;
19432 }
19433 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
19434 } while (LastInsertInst != nullptr &&
19436 LastInsertInst->hasOneUse());
19437}
19438
19439/// Recognize construction of vectors like
19440/// %ra = insertelement <4 x float> poison, float %s0, i32 0
19441/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
19442/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
19443/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
19444/// starting from the last insertelement or insertvalue instruction.
19445///
19446/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
19447/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
19448/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
19449///
19450/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
19451///
19452/// \return true if it matches.
19453static bool findBuildAggregate(Instruction *LastInsertInst,
19455 SmallVectorImpl<Value *> &BuildVectorOpds,
19456 SmallVectorImpl<Value *> &InsertElts) {
19457
19458 assert((isa<InsertElementInst>(LastInsertInst) ||
19459 isa<InsertValueInst>(LastInsertInst)) &&
19460 "Expected insertelement or insertvalue instruction!");
19461
19462 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
19463 "Expected empty result vectors!");
19464
19465 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
19466 if (!AggregateSize)
19467 return false;
19468 BuildVectorOpds.resize(*AggregateSize);
19469 InsertElts.resize(*AggregateSize);
19470
19471 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
19472 llvm::erase(BuildVectorOpds, nullptr);
19473 llvm::erase(InsertElts, nullptr);
19474 if (BuildVectorOpds.size() >= 2)
19475 return true;
19476
19477 return false;
19478}
19479
19480/// Try and get a reduction instruction from a phi node.
19481///
19482/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
19483/// if they come from either \p ParentBB or a containing loop latch.
19484///
19485/// \returns A candidate reduction value if possible, or \code nullptr \endcode
19486/// if not possible.
19488 BasicBlock *ParentBB, LoopInfo *LI) {
19489 // There are situations where the reduction value is not dominated by the
19490 // reduction phi. Vectorizing such cases has been reported to cause
19491 // miscompiles. See PR25787.
19492 auto DominatedReduxValue = [&](Value *R) {
19493 return isa<Instruction>(R) &&
19494 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
19495 };
19496
19497 Instruction *Rdx = nullptr;
19498
19499 // Return the incoming value if it comes from the same BB as the phi node.
19500 if (P->getIncomingBlock(0) == ParentBB) {
19501 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
19502 } else if (P->getIncomingBlock(1) == ParentBB) {
19503 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
19504 }
19505
19506 if (Rdx && DominatedReduxValue(Rdx))
19507 return Rdx;
19508
19509 // Otherwise, check whether we have a loop latch to look at.
19510 Loop *BBL = LI->getLoopFor(ParentBB);
19511 if (!BBL)
19512 return nullptr;
19513 BasicBlock *BBLatch = BBL->getLoopLatch();
19514 if (!BBLatch)
19515 return nullptr;
19516
19517 // There is a loop latch, return the incoming value if it comes from
19518 // that. This reduction pattern occasionally turns up.
19519 if (P->getIncomingBlock(0) == BBLatch) {
19520 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
19521 } else if (P->getIncomingBlock(1) == BBLatch) {
19522 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
19523 }
19524
19525 if (Rdx && DominatedReduxValue(Rdx))
19526 return Rdx;
19527
19528 return nullptr;
19529}
19530
19531static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
19532 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
19533 return true;
19535 return true;
19537 return true;
19539 return true;
19541 return true;
19543 return true;
19545 return true;
19547 return true;
19549 return true;
19550 return false;
19551}
19552
19553/// We could have an initial reduction that is not an add.
19554/// r *= v1 + v2 + v3 + v4
19555/// In such a case start looking for a tree rooted in the first '+'.
19556/// \Returns the new root if found, which may be nullptr if not an instruction.
19558 Instruction *Root) {
19559 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
19560 isa<IntrinsicInst>(Root)) &&
19561 "Expected binop, select, or intrinsic for reduction matching");
19562 Value *LHS =
19563 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
19564 Value *RHS =
19565 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
19566 if (LHS == Phi)
19567 return dyn_cast<Instruction>(RHS);
19568 if (RHS == Phi)
19569 return dyn_cast<Instruction>(LHS);
19570 return nullptr;
19571}
19572
19573/// \p Returns the first operand of \p I that does not match \p Phi. If
19574/// operand is not an instruction it returns nullptr.
19576 Value *Op0 = nullptr;
19577 Value *Op1 = nullptr;
19578 if (!matchRdxBop(I, Op0, Op1))
19579 return nullptr;
19580 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
19581}
19582
19583/// \Returns true if \p I is a candidate instruction for reduction vectorization.
19585 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
19586 Value *B0 = nullptr, *B1 = nullptr;
19587 bool IsBinop = matchRdxBop(I, B0, B1);
19588 return IsBinop || IsSelect;
19589}
19590
19591bool SLPVectorizerPass::vectorizeHorReduction(
19593 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
19594 if (!ShouldVectorizeHor)
19595 return false;
19596 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
19597
19598 if (Root->getParent() != BB || isa<PHINode>(Root))
19599 return false;
19600
19601 // If we can find a secondary reduction root, use that instead.
19602 auto SelectRoot = [&]() {
19603 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
19604 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
19605 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
19606 return NewRoot;
19607 return Root;
19608 };
19609
19610 // Start analysis starting from Root instruction. If horizontal reduction is
19611 // found, try to vectorize it. If it is not a horizontal reduction or
19612 // vectorization is not possible or not effective, and currently analyzed
19613 // instruction is a binary operation, try to vectorize the operands, using
19614 // pre-order DFS traversal order. If the operands were not vectorized, repeat
19615 // the same procedure considering each operand as a possible root of the
19616 // horizontal reduction.
19617 // Interrupt the process if the Root instruction itself was vectorized or all
19618 // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
19619 // If a horizintal reduction was not matched or vectorized we collect
19620 // instructions for possible later attempts for vectorization.
19621 std::queue<std::pair<Instruction *, unsigned>> Stack;
19622 Stack.emplace(SelectRoot(), 0);
19623 SmallPtrSet<Value *, 8> VisitedInstrs;
19624 bool Res = false;
19625 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
19626 if (R.isAnalyzedReductionRoot(Inst))
19627 return nullptr;
19628 if (!isReductionCandidate(Inst))
19629 return nullptr;
19630 HorizontalReduction HorRdx;
19631 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
19632 return nullptr;
19633 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
19634 };
19635 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
19636 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
19637 FutureSeed = getNonPhiOperand(Root, P);
19638 if (!FutureSeed)
19639 return false;
19640 }
19641 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
19642 // analysis is done separately.
19644 PostponedInsts.push_back(FutureSeed);
19645 return true;
19646 };
19647
19648 while (!Stack.empty()) {
19649 Instruction *Inst;
19650 unsigned Level;
19651 std::tie(Inst, Level) = Stack.front();
19652 Stack.pop();
19653 // Do not try to analyze instruction that has already been vectorized.
19654 // This may happen when we vectorize instruction operands on a previous
19655 // iteration while stack was populated before that happened.
19656 if (R.isDeleted(Inst))
19657 continue;
19658 if (Value *VectorizedV = TryToReduce(Inst)) {
19659 Res = true;
19660 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
19661 // Try to find another reduction.
19662 Stack.emplace(I, Level);
19663 continue;
19664 }
19665 if (R.isDeleted(Inst))
19666 continue;
19667 } else {
19668 // We could not vectorize `Inst` so try to use it as a future seed.
19669 if (!TryAppendToPostponedInsts(Inst)) {
19670 assert(Stack.empty() && "Expected empty stack");
19671 break;
19672 }
19673 }
19674
19675 // Try to vectorize operands.
19676 // Continue analysis for the instruction from the same basic block only to
19677 // save compile time.
19678 if (++Level < RecursionMaxDepth)
19679 for (auto *Op : Inst->operand_values())
19680 if (VisitedInstrs.insert(Op).second)
19681 if (auto *I = dyn_cast<Instruction>(Op))
19682 // Do not try to vectorize CmpInst operands, this is done
19683 // separately.
19685 !R.isDeleted(I) && I->getParent() == BB)
19686 Stack.emplace(I, Level);
19687 }
19688 return Res;
19689}
19690
19691bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
19692 BasicBlock *BB, BoUpSLP &R,
19694 SmallVector<WeakTrackingVH> PostponedInsts;
19695 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
19696 Res |= tryToVectorize(PostponedInsts, R);
19697 return Res;
19698}
19699
19700bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
19701 BoUpSLP &R) {
19702 bool Res = false;
19703 for (Value *V : Insts)
19704 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
19705 Res |= tryToVectorize(Inst, R);
19706 return Res;
19707}
19708
19709bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
19710 BasicBlock *BB, BoUpSLP &R,
19711 bool MaxVFOnly) {
19712 if (!R.canMapToVector(IVI->getType()))
19713 return false;
19714
19715 SmallVector<Value *, 16> BuildVectorOpds;
19716 SmallVector<Value *, 16> BuildVectorInsts;
19717 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
19718 return false;
19719
19720 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
19721 R.getORE()->emit([&]() {
19722 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
19723 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
19724 "trying reduction first.";
19725 });
19726 return false;
19727 }
19728 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
19729 // Aggregate value is unlikely to be processed in vector register.
19730 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
19731}
19732
19733bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
19734 BasicBlock *BB, BoUpSLP &R,
19735 bool MaxVFOnly) {
19736 SmallVector<Value *, 16> BuildVectorInsts;
19737 SmallVector<Value *, 16> BuildVectorOpds;
19739 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
19741 isFixedVectorShuffle(BuildVectorOpds, Mask)))
19742 return false;
19743
19744 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
19745 R.getORE()->emit([&]() {
19746 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
19747 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
19748 "trying reduction first.";
19749 });
19750 return false;
19751 }
19752 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
19753 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
19754}
19755
19756template <typename T>
19758 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
19759 function_ref<bool(T *, T *)> AreCompatible,
19760 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
19761 bool MaxVFOnly, BoUpSLP &R) {
19762 bool Changed = false;
19763 // Sort by type, parent, operands.
19764 stable_sort(Incoming, Comparator);
19765
19766 // Try to vectorize elements base on their type.
19767 SmallVector<T *> Candidates;
19769 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
19770 VL.clear()) {
19771 // Look for the next elements with the same type, parent and operand
19772 // kinds.
19773 auto *I = dyn_cast<Instruction>(*IncIt);
19774 if (!I || R.isDeleted(I)) {
19775 ++IncIt;
19776 continue;
19777 }
19778 auto *SameTypeIt = IncIt;
19779 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
19780 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
19781 AreCompatible(*SameTypeIt, *IncIt))) {
19782 auto *I = dyn_cast<Instruction>(*SameTypeIt);
19783 ++SameTypeIt;
19784 if (I && !R.isDeleted(I))
19785 VL.push_back(cast<T>(I));
19786 }
19787
19788 // Try to vectorize them.
19789 unsigned NumElts = VL.size();
19790 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
19791 << NumElts << ")\n");
19792 // The vectorization is a 3-state attempt:
19793 // 1. Try to vectorize instructions with the same/alternate opcodes with the
19794 // size of maximal register at first.
19795 // 2. Try to vectorize remaining instructions with the same type, if
19796 // possible. This may result in the better vectorization results rather than
19797 // if we try just to vectorize instructions with the same/alternate opcodes.
19798 // 3. Final attempt to try to vectorize all instructions with the
19799 // same/alternate ops only, this may result in some extra final
19800 // vectorization.
19801 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
19802 // Success start over because instructions might have been changed.
19803 Changed = true;
19804 VL.swap(Candidates);
19805 Candidates.clear();
19806 for (T *V : VL) {
19807 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
19808 Candidates.push_back(V);
19809 }
19810 } else {
19811 /// \Returns the minimum number of elements that we will attempt to
19812 /// vectorize.
19813 auto GetMinNumElements = [&R](Value *V) {
19814 unsigned EltSize = R.getVectorElementSize(V);
19815 return std::max(2U, R.getMaxVecRegSize() / EltSize);
19816 };
19817 if (NumElts < GetMinNumElements(*IncIt) &&
19818 (Candidates.empty() ||
19819 Candidates.front()->getType() == (*IncIt)->getType())) {
19820 for (T *V : VL) {
19821 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
19822 Candidates.push_back(V);
19823 }
19824 }
19825 }
19826 // Final attempt to vectorize instructions with the same types.
19827 if (Candidates.size() > 1 &&
19828 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
19829 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
19830 // Success start over because instructions might have been changed.
19831 Changed = true;
19832 } else if (MaxVFOnly) {
19833 // Try to vectorize using small vectors.
19835 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
19836 VL.clear()) {
19837 auto *I = dyn_cast<Instruction>(*It);
19838 if (!I || R.isDeleted(I)) {
19839 ++It;
19840 continue;
19841 }
19842 auto *SameTypeIt = It;
19843 while (SameTypeIt != End &&
19844 (!isa<Instruction>(*SameTypeIt) ||
19845 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
19846 AreCompatible(*SameTypeIt, *It))) {
19847 auto *I = dyn_cast<Instruction>(*SameTypeIt);
19848 ++SameTypeIt;
19849 if (I && !R.isDeleted(I))
19850 VL.push_back(cast<T>(I));
19851 }
19852 unsigned NumElts = VL.size();
19853 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
19854 /*MaxVFOnly=*/false))
19855 Changed = true;
19856 It = SameTypeIt;
19857 }
19858 }
19859 Candidates.clear();
19860 }
19861
19862 // Start over at the next instruction of a different type (or the end).
19863 IncIt = SameTypeIt;
19864 }
19865 return Changed;
19866}
19867
19868/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible corresponding
19870/// operands. If IsCompatibility is false, function implements strict weak
19871/// ordering relation between two cmp instructions, returning true if the first
19872/// instruction is "less" than the second, i.e. its predicate is less than the
19873/// predicate of the second or the operands IDs are less than the operands IDs
19874/// of the second cmp instruction.
19875template <bool IsCompatibility>
19876static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
19877 const DominatorTree &DT) {
19878 assert(isValidElementType(V->getType()) &&
19879 isValidElementType(V2->getType()) &&
19880 "Expected valid element types only.");
19881 if (V == V2)
19882 return IsCompatibility;
19883 auto *CI1 = cast<CmpInst>(V);
19884 auto *CI2 = cast<CmpInst>(V2);
19885 if (CI1->getOperand(0)->getType()->getTypeID() <
19886 CI2->getOperand(0)->getType()->getTypeID())
19887 return !IsCompatibility;
19888 if (CI1->getOperand(0)->getType()->getTypeID() >
19889 CI2->getOperand(0)->getType()->getTypeID())
19890 return false;
19891 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
19893 return !IsCompatibility;
19894 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
19896 return false;
19897 CmpInst::Predicate Pred1 = CI1->getPredicate();
19898 CmpInst::Predicate Pred2 = CI2->getPredicate();
19901 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
19902 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
19903 if (BasePred1 < BasePred2)
19904 return !IsCompatibility;
19905 if (BasePred1 > BasePred2)
19906 return false;
19907 // Compare operands.
19908 bool CI1Preds = Pred1 == BasePred1;
19909 bool CI2Preds = Pred2 == BasePred1;
19910 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
19911 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
19912 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
19913 if (Op1 == Op2)
19914 continue;
19915 if (Op1->getValueID() < Op2->getValueID())
19916 return !IsCompatibility;
19917 if (Op1->getValueID() > Op2->getValueID())
19918 return false;
19919 if (auto *I1 = dyn_cast<Instruction>(Op1))
19920 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
19921 if (IsCompatibility) {
19922 if (I1->getParent() != I2->getParent())
19923 return false;
19924 } else {
19925 // Try to compare nodes with same parent.
19926 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
19927 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
19928 if (!NodeI1)
19929 return NodeI2 != nullptr;
19930 if (!NodeI2)
19931 return false;
19932 assert((NodeI1 == NodeI2) ==
19933 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19934 "Different nodes should have different DFS numbers");
19935 if (NodeI1 != NodeI2)
19936 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19937 }
19938 InstructionsState S = getSameOpcode({I1, I2}, TLI);
19939 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
19940 continue;
19941 if (IsCompatibility)
19942 return false;
19943 if (I1->getOpcode() != I2->getOpcode())
19944 return I1->getOpcode() < I2->getOpcode();
19945 }
19946 }
19947 return IsCompatibility;
19948}
19949
19950template <typename ItT>
19951bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
19952 BasicBlock *BB, BoUpSLP &R) {
19953 bool Changed = false;
19954 // Try to find reductions first.
19955 for (CmpInst *I : CmpInsts) {
19956 if (R.isDeleted(I))
19957 continue;
19958 for (Value *Op : I->operands())
19959 if (auto *RootOp = dyn_cast<Instruction>(Op))
19960 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
19961 }
19962 // Try to vectorize operands as vector bundles.
19963 for (CmpInst *I : CmpInsts) {
19964 if (R.isDeleted(I))
19965 continue;
19966 Changed |= tryToVectorize(I, R);
19967 }
19968 // Try to vectorize list of compares.
19969 // Sort by type, compare predicate, etc.
19970 auto CompareSorter = [&](Value *V, Value *V2) {
19971 if (V == V2)
19972 return false;
19973 return compareCmp<false>(V, V2, *TLI, *DT);
19974 };
19975
19976 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
19977 if (V1 == V2)
19978 return true;
19979 return compareCmp<true>(V1, V2, *TLI, *DT);
19980 };
19981
19983 for (Instruction *V : CmpInsts)
19984 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
19985 Vals.push_back(V);
19986 if (Vals.size() <= 1)
19987 return Changed;
19989 Vals, CompareSorter, AreCompatibleCompares,
19990 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
19991 // Exclude possible reductions from other blocks.
19992 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
19993 return any_of(V->users(), [V](User *U) {
19994 auto *Select = dyn_cast<SelectInst>(U);
19995 return Select &&
19996 Select->getParent() != cast<Instruction>(V)->getParent();
19997 });
19998 });
19999 if (ArePossiblyReducedInOtherBlock)
20000 return false;
20001 return tryToVectorizeList(Candidates, R, MaxVFOnly);
20002 },
20003 /*MaxVFOnly=*/true, R);
20004 return Changed;
20005}
20006
20007bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
20008 BasicBlock *BB, BoUpSLP &R) {
20010 "This function only accepts Insert instructions");
20011 bool OpsChanged = false;
20012 SmallVector<WeakTrackingVH> PostponedInsts;
20013 for (auto *I : reverse(Instructions)) {
20014 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
20015 if (R.isDeleted(I) || isa<CmpInst>(I))
20016 continue;
20017 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
20018 OpsChanged |=
20019 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
20020 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
20021 OpsChanged |=
20022 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
20023 }
20024 // pass2 - try to vectorize reductions only
20025 if (R.isDeleted(I))
20026 continue;
20027 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
20028 if (R.isDeleted(I) || isa<CmpInst>(I))
20029 continue;
20030 // pass3 - try to match and vectorize a buildvector sequence.
20031 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
20032 OpsChanged |=
20033 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
20034 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
20035 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
20036 /*MaxVFOnly=*/false);
20037 }
20038 }
20039 // Now try to vectorize postponed instructions.
20040 OpsChanged |= tryToVectorize(PostponedInsts, R);
20041
20042 Instructions.clear();
20043 return OpsChanged;
20044}
20045
20046bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
20047 bool Changed = false;
20049 SmallPtrSet<Value *, 16> VisitedInstrs;
20050 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
20051 // node. Allows better to identify the chains that can be vectorized in the
20052 // better way.
20054 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
20056 isValidElementType(V2->getType()) &&
20057 "Expected vectorizable types only.");
20058 // It is fine to compare type IDs here, since we expect only vectorizable
20059 // types, like ints, floats and pointers, we don't care about other type.
20060 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
20061 return true;
20062 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
20063 return false;
20064 if (V1->getType()->getScalarSizeInBits() <
20065 V2->getType()->getScalarSizeInBits())
20066 return true;
20067 if (V1->getType()->getScalarSizeInBits() >
20068 V2->getType()->getScalarSizeInBits())
20069 return false;
20070 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
20071 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
20072 if (Opcodes1.size() < Opcodes2.size())
20073 return true;
20074 if (Opcodes1.size() > Opcodes2.size())
20075 return false;
20076 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
20077 {
20078 // Instructions come first.
20079 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
20080 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
20081 if (I1 && I2) {
20082 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
20083 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
20084 if (!NodeI1)
20085 return NodeI2 != nullptr;
20086 if (!NodeI2)
20087 return false;
20088 assert((NodeI1 == NodeI2) ==
20089 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
20090 "Different nodes should have different DFS numbers");
20091 if (NodeI1 != NodeI2)
20092 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
20093 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
20094 if (S.getOpcode() && !S.isAltShuffle())
20095 continue;
20096 return I1->getOpcode() < I2->getOpcode();
20097 }
20098 if (I1)
20099 return true;
20100 if (I2)
20101 return false;
20102 }
20103 {
20104 // Non-undef constants come next.
20105 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
20106 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
20107 if (C1 && C2)
20108 continue;
20109 if (C1)
20110 return true;
20111 if (C2)
20112 return false;
20113 }
20114 bool U1 = isa<UndefValue>(Opcodes1[I]);
20115 bool U2 = isa<UndefValue>(Opcodes2[I]);
20116 {
20117 // Non-constant non-instructions come next.
20118 if (!U1 && !U2) {
20119 auto ValID1 = Opcodes1[I]->getValueID();
20120 auto ValID2 = Opcodes2[I]->getValueID();
20121 if (ValID1 == ValID2)
20122 continue;
20123 if (ValID1 < ValID2)
20124 return true;
20125 if (ValID1 > ValID2)
20126 return false;
20127 }
20128 if (!U1)
20129 return true;
20130 if (!U2)
20131 return false;
20132 }
20133 // Undefs come last.
20134 assert(U1 && U2 && "The only thing left should be undef & undef.");
20135 }
20136 return false;
20137 };
20138 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
20139 if (V1 == V2)
20140 return true;
20141 if (V1->getType() != V2->getType())
20142 return false;
20143 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
20144 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
20145 if (Opcodes1.size() != Opcodes2.size())
20146 return false;
20147 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
20148 // Undefs are compatible with any other value.
20149 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
20150 continue;
20151 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
20152 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
20153 if (R.isDeleted(I1) || R.isDeleted(I2))
20154 return false;
20155 if (I1->getParent() != I2->getParent())
20156 return false;
20157 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
20158 if (S.getOpcode())
20159 continue;
20160 return false;
20161 }
20162 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
20163 continue;
20164 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
20165 return false;
20166 }
20167 return true;
20168 };
20169
20170 bool HaveVectorizedPhiNodes = false;
20171 do {
20172 // Collect the incoming values from the PHIs.
20173 Incoming.clear();
20174 for (Instruction &I : *BB) {
20175 auto *P = dyn_cast<PHINode>(&I);
20176 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
20177 break;
20178
20179 // No need to analyze deleted, vectorized and non-vectorizable
20180 // instructions.
20181 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
20182 isValidElementType(P->getType()))
20183 Incoming.push_back(P);
20184 }
20185
20186 if (Incoming.size() <= 1)
20187 break;
20188
20189 // Find the corresponding non-phi nodes for better matching when trying to
20190 // build the tree.
20191 for (Value *V : Incoming) {
20192 SmallVectorImpl<Value *> &Opcodes =
20193 PHIToOpcodes.try_emplace(V).first->getSecond();
20194 if (!Opcodes.empty())
20195 continue;
20196 SmallVector<Value *, 4> Nodes(1, V);
20198 while (!Nodes.empty()) {
20199 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
20200 if (!Visited.insert(PHI).second)
20201 continue;
20202 for (Value *V : PHI->incoming_values()) {
20203 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
20204 Nodes.push_back(PHI1);
20205 continue;
20206 }
20207 Opcodes.emplace_back(V);
20208 }
20209 }
20210 }
20211
20212 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
20213 Incoming, PHICompare, AreCompatiblePHIs,
20214 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
20215 return tryToVectorizeList(Candidates, R, MaxVFOnly);
20216 },
20217 /*MaxVFOnly=*/true, R);
20218 Changed |= HaveVectorizedPhiNodes;
20219 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
20220 auto *PHI = dyn_cast<PHINode>(P.first);
20221 return !PHI || R.isDeleted(PHI);
20222 }))
20223 PHIToOpcodes.clear();
20224 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
20225 } while (HaveVectorizedPhiNodes);
20226
20227 VisitedInstrs.clear();
20228
20229 InstSetVector PostProcessInserts;
20230 SmallSetVector<CmpInst *, 8> PostProcessCmps;
20231 // Vectorizes Inserts in `PostProcessInserts` and if `VecctorizeCmps` is true
20232 // also vectorizes `PostProcessCmps`.
20233 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
20234 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
20235 if (VectorizeCmps) {
20236 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
20237 PostProcessCmps.clear();
20238 }
20239 PostProcessInserts.clear();
20240 return Changed;
20241 };
20242 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
20243 auto IsInPostProcessInstrs = [&](Instruction *I) {
20244 if (auto *Cmp = dyn_cast<CmpInst>(I))
20245 return PostProcessCmps.contains(Cmp);
20247 PostProcessInserts.contains(I);
20248 };
20249 // Returns true if `I` is an instruction without users, like terminator, or
20250 // function call with ignored return value, store. Ignore unused instructions
20251 // (basing on instruction type, except for CallInst and InvokeInst).
20252 auto HasNoUsers = [](Instruction *I) {
20253 return I->use_empty() &&
20254 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
20255 };
20256 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
20257 // Skip instructions with scalable type. The num of elements is unknown at
20258 // compile-time for scalable type.
20259 if (isa<ScalableVectorType>(It->getType()))
20260 continue;
20261
20262 // Skip instructions marked for the deletion.
20263 if (R.isDeleted(&*It))
20264 continue;
20265 // We may go through BB multiple times so skip the one we have checked.
20266 if (!VisitedInstrs.insert(&*It).second) {
20267 if (HasNoUsers(&*It) &&
20268 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
20269 // We would like to start over since some instructions are deleted
20270 // and the iterator may become invalid value.
20271 Changed = true;
20272 It = BB->begin();
20273 E = BB->end();
20274 }
20275 continue;
20276 }
20277
20278 if (isa<DbgInfoIntrinsic>(It))
20279 continue;
20280
20281 // Try to vectorize reductions that use PHINodes.
20282 if (PHINode *P = dyn_cast<PHINode>(It)) {
20283 // Check that the PHI is a reduction PHI.
20284 if (P->getNumIncomingValues() == 2) {
20285 // Try to match and vectorize a horizontal reduction.
20286 Instruction *Root = getReductionInstr(DT, P, BB, LI);
20287 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
20288 Changed = true;
20289 It = BB->begin();
20290 E = BB->end();
20291 continue;
20292 }
20293 }
20294 // Try to vectorize the incoming values of the PHI, to catch reductions
20295 // that feed into PHIs.
20296 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
20297 // Skip if the incoming block is the current BB for now. Also, bypass
20298 // unreachable IR for efficiency and to avoid crashing.
20299 // TODO: Collect the skipped incoming values and try to vectorize them
20300 // after processing BB.
20301 if (BB == P->getIncomingBlock(I) ||
20302 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
20303 continue;
20304
20305 // Postponed instructions should not be vectorized here, delay their
20306 // vectorization.
20307 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
20308 PI && !IsInPostProcessInstrs(PI)) {
20309 bool Res = vectorizeRootInstruction(nullptr, PI,
20310 P->getIncomingBlock(I), R, TTI);
20311 Changed |= Res;
20312 if (Res && R.isDeleted(P)) {
20313 It = BB->begin();
20314 E = BB->end();
20315 break;
20316 }
20317 }
20318 }
20319 continue;
20320 }
20321
20322 if (HasNoUsers(&*It)) {
20323 bool OpsChanged = false;
20324 auto *SI = dyn_cast<StoreInst>(It);
20325 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
20326 if (SI) {
20327 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
20328 // Try to vectorize chain in store, if this is the only store to the
20329 // address in the block.
20330 // TODO: This is just a temporarily solution to save compile time. Need
20331 // to investigate if we can safely turn on slp-vectorize-hor-store
20332 // instead to allow lookup for reduction chains in all non-vectorized
20333 // stores (need to check side effects and compile time).
20334 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
20335 SI->getValueOperand()->hasOneUse();
20336 }
20337 if (TryToVectorizeRoot) {
20338 for (auto *V : It->operand_values()) {
20339 // Postponed instructions should not be vectorized here, delay their
20340 // vectorization.
20341 if (auto *VI = dyn_cast<Instruction>(V);
20342 VI && !IsInPostProcessInstrs(VI))
20343 // Try to match and vectorize a horizontal reduction.
20344 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
20345 }
20346 }
20347 // Start vectorization of post-process list of instructions from the
20348 // top-tree instructions to try to vectorize as many instructions as
20349 // possible.
20350 OpsChanged |=
20351 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
20352 if (OpsChanged) {
20353 // We would like to start over since some instructions are deleted
20354 // and the iterator may become invalid value.
20355 Changed = true;
20356 It = BB->begin();
20357 E = BB->end();
20358 continue;
20359 }
20360 }
20361
20363 PostProcessInserts.insert(&*It);
20364 else if (isa<CmpInst>(It))
20365 PostProcessCmps.insert(cast<CmpInst>(&*It));
20366 }
20367
20368 return Changed;
20369}
20370
20371bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
20372 auto Changed = false;
20373 for (auto &Entry : GEPs) {
20374 // If the getelementptr list has fewer than two elements, there's nothing
20375 // to do.
20376 if (Entry.second.size() < 2)
20377 continue;
20378
20379 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
20380 << Entry.second.size() << ".\n");
20381
20382 // Process the GEP list in chunks suitable for the target's supported
20383 // vector size. If a vector register can't hold 1 element, we are done. We
20384 // are trying to vectorize the index computations, so the maximum number of
20385 // elements is based on the size of the index expression, rather than the
20386 // size of the GEP itself (the target's pointer size).
20387 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
20388 return !R.isDeleted(GEP);
20389 });
20390 if (It == Entry.second.end())
20391 continue;
20392 unsigned MaxVecRegSize = R.getMaxVecRegSize();
20393 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
20394 if (MaxVecRegSize < EltSize)
20395 continue;
20396
20397 unsigned MaxElts = MaxVecRegSize / EltSize;
20398 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
20399 auto Len = std::min<unsigned>(BE - BI, MaxElts);
20400 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
20401
20402 // Initialize a set a candidate getelementptrs. Note that we use a
20403 // SetVector here to preserve program order. If the index computations
20404 // are vectorizable and begin with loads, we want to minimize the chance
20405 // of having to reorder them later.
20406 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
20407
20408 // Some of the candidates may have already been vectorized after we
20409 // initially collected them or their index is optimized to constant value.
20410 // If so, they are marked as deleted, so remove them from the set of
20411 // candidates.
20412 Candidates.remove_if([&R](Value *I) {
20413 return R.isDeleted(cast<Instruction>(I)) ||
20414 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
20415 });
20416
20417 // Remove from the set of candidates all pairs of getelementptrs with
20418 // constant differences. Such getelementptrs are likely not good
20419 // candidates for vectorization in a bottom-up phase since one can be
20420 // computed from the other. We also ensure all candidate getelementptr
20421 // indices are unique.
20422 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
20423 auto *GEPI = GEPList[I];
20424 if (!Candidates.count(GEPI))
20425 continue;
20426 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
20427 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
20428 auto *GEPJ = GEPList[J];
20429 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
20430 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
20431 Candidates.remove(GEPI);
20432 Candidates.remove(GEPJ);
20433 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
20434 Candidates.remove(GEPJ);
20435 }
20436 }
20437 }
20438
20439 // We break out of the above computation as soon as we know there are
20440 // fewer than two candidates remaining.
20441 if (Candidates.size() < 2)
20442 continue;
20443
20444 // Add the single, non-constant index of each candidate to the bundle. We
20445 // ensured the indices met these constraints when we originally collected
20446 // the getelementptrs.
20447 SmallVector<Value *, 16> Bundle(Candidates.size());
20448 auto BundleIndex = 0u;
20449 for (auto *V : Candidates) {
20450 auto *GEP = cast<GetElementPtrInst>(V);
20451 auto *GEPIdx = GEP->idx_begin()->get();
20452 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
20453 Bundle[BundleIndex++] = GEPIdx;
20454 }
20455
20456 // Try and vectorize the indices. We are currently only interested in
20457 // gather-like cases of the form:
20458 //
20459 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
20460 //
20461 // where the loads of "a", the loads of "b", and the subtractions can be
20462 // performed in parallel. It's likely that detecting this pattern in a
20463 // bottom-up phase will be simpler and less costly than building a
20464 // full-blown top-down phase beginning at the consecutive loads.
20465 Changed |= tryToVectorizeList(Bundle, R);
20466 }
20467 }
20468 return Changed;
20469}
20470
20471bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
20472 bool Changed = false;
20473 // Sort by type, base pointers and values operand. Value operands must be
20474 // compatible (have the same opcode, same parent), otherwise it is
20475 // definitely not profitable to try to vectorize them.
20476 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
20477 if (V->getValueOperand()->getType()->getTypeID() <
20478 V2->getValueOperand()->getType()->getTypeID())
20479 return true;
20480 if (V->getValueOperand()->getType()->getTypeID() >
20481 V2->getValueOperand()->getType()->getTypeID())
20482 return false;
20483 if (V->getPointerOperandType()->getTypeID() <
20484 V2->getPointerOperandType()->getTypeID())
20485 return true;
20486 if (V->getPointerOperandType()->getTypeID() >
20487 V2->getPointerOperandType()->getTypeID())
20488 return false;
20489 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
20490 V2->getValueOperand()->getType()->getScalarSizeInBits())
20491 return true;
20492 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
20493 V2->getValueOperand()->getType()->getScalarSizeInBits())
20494 return false;
20495 // UndefValues are compatible with all other values.
20496 if (isa<UndefValue>(V->getValueOperand()) ||
20497 isa<UndefValue>(V2->getValueOperand()))
20498 return false;
20499 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
20500 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
20502 DT->getNode(I1->getParent());
20504 DT->getNode(I2->getParent());
20505 assert(NodeI1 && "Should only process reachable instructions");
20506 assert(NodeI2 && "Should only process reachable instructions");
20507 assert((NodeI1 == NodeI2) ==
20508 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
20509 "Different nodes should have different DFS numbers");
20510 if (NodeI1 != NodeI2)
20511 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
20512 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
20513 if (S.getOpcode())
20514 return false;
20515 return I1->getOpcode() < I2->getOpcode();
20516 }
20517 if (isa<Constant>(V->getValueOperand()) &&
20518 isa<Constant>(V2->getValueOperand()))
20519 return false;
20520 return V->getValueOperand()->getValueID() <
20521 V2->getValueOperand()->getValueID();
20522 };
20523
20524 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
20525 if (V1 == V2)
20526 return true;
20527 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
20528 return false;
20529 if (V1->getPointerOperandType() != V2->getPointerOperandType())
20530 return false;
20531 // Undefs are compatible with any other value.
20532 if (isa<UndefValue>(V1->getValueOperand()) ||
20533 isa<UndefValue>(V2->getValueOperand()))
20534 return true;
20535 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
20536 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
20537 if (I1->getParent() != I2->getParent())
20538 return false;
20539 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
20540 return S.getOpcode() > 0;
20541 }
20542 if (isa<Constant>(V1->getValueOperand()) &&
20543 isa<Constant>(V2->getValueOperand()))
20544 return true;
20545 return V1->getValueOperand()->getValueID() ==
20546 V2->getValueOperand()->getValueID();
20547 };
20548
20549 // Attempt to sort and vectorize each of the store-groups.
20551 for (auto &Pair : Stores) {
20552 if (Pair.second.size() < 2)
20553 continue;
20554
20555 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
20556 << Pair.second.size() << ".\n");
20557
20558 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
20559 continue;
20560
20561 // Reverse stores to do bottom-to-top analysis. This is important if the
20562 // values are stores to the same addresses several times, in this case need
20563 // to follow the stores order (reversed to meet the memory dependecies).
20564 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
20565 Pair.second.rend());
20567 ReversedStores, StoreSorter, AreCompatibleStores,
20568 [&](ArrayRef<StoreInst *> Candidates, bool) {
20569 return vectorizeStores(Candidates, R, Attempted);
20570 },
20571 /*MaxVFOnly=*/false, R);
20572 }
20573 return Changed;
20574}
aarch64 AArch64 CCMP Pass
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:604
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
#define LLVM_DEBUG(X)
Definition Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Loop::LoopBounds::Direction Direction
Definition LoopInfo.cpp:231
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
#define T
uint64_t IntrinsicInst * II
return ToRemove size() > 0
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
const MachineOperand & RHS
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Type * getValueType(T *V)
Returns the type of the given value/instruction V.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition VPlanSLP.cpp:154
Value * LHS
static const uint32_t IV[8]
Definition blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:227
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1323
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:364
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:373
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition APInt.cpp:1641
void clearAllBits()
Set every bit to 0.
Definition APInt.h:1390
void setAllBits()
Set every bit to 1.
Definition APInt.h:1312
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition APInt.h:1360
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:193
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition APInt.h:279
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:232
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition ArrayRef.h:168
iterator end() const
Definition ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition ArrayRef.h:165
iterator begin() const
Definition ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:195
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:61
iterator end()
Definition BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:448
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:219
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:179
reverse_iterator rend()
Definition BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:177
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
iterator_range< bundle_op_iterator > bundle_op_infos()
Return the range [bundle_op_info_begin, bundle_op_info_end).
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition InstrTypes.h:763
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition DataLayout.h:436
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:619
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:226
bool erase(const KeyT &Val)
Definition DenseMap.h:321
unsigned size() const
Definition DenseMap.h:99
bool empty() const
Definition DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:152
iterator end()
Definition DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:211
Implements a dense probed hash-table based set.
Definition DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:20
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:689
ArrayRef< Type * > params() const
Type * getReturnType() const
bool empty() const
Definition Function.h:856
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
For the node iterator we just need to turn the TreeEntry iterator into a TreeEntry* iterator so that ...
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:91
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2692
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isCast() const
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isSafeToRemove() const LLVM_READONLY
Return true if the instruction can be removed if the result is unused.
bool isIntDivRem() const
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:275
An instruction for reading from memory.
Value * getPointerOperand()
bool isSimple() const
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type count(const KeyT &Key) const
Definition MapVector.h:165
iterator end()
Definition MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition MapVector.h:55
iterator find(const KeyT &Key)
Definition MapVector.h:167
bool empty() const
Definition MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:110
size_type size() const
Definition MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:83
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
Definition ArrayRef.h:363
iterator end() const
Definition ArrayRef.h:357
iterator begin() const
Definition ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T get() const
Returns the value of the specified pointer type.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:98
void clear()
Completely clear the SetVector.
Definition SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:298
size_type size() const
Definition SmallPtrSet.h:95
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
Definition LoopUtils.h:52
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:181
size_type size() const
Definition SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition MD5.h:38
size_t size() const
Definition SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
\Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
Definition Type.cpp:149
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:257
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:230
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:251
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition Type.h:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:342
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:199
static IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:248
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition Type.h:254
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:224
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:212
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:43
op_range operands()
Definition User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition User.h:115
op_iterator op_begin()
Definition User.h:280
Value * getOperand(unsigned i) const
Definition User.h:228
unsigned getNumOperands() const
Definition User.h:250
iterator_range< value_op_iterator > operand_values()
Definition User.h:312
The Vector Function Database.
Definition VectorUtils.h:30
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition VectorUtils.h:71
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:397
const Value * stripInBoundsConstantOffsets() const
Strip off pointer casts and all-constant inbounds GEPs.
Definition Value.cpp:706
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:434
iterator_range< user_iterator > users()
Definition Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:149
bool use_empty() const
Definition Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:309
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:213
size_type size() const
Definition DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:193
bool erase(const ValueT &V)
Definition DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
Definition Attributor.h:150
An opaque object representing a hash code.
Definition Hashing.h:75
const ParentTy * getParent() const
Definition ilist_node.h:32
self_iterator getIterator()
Definition ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:353
CRTP base class for adapting an iterator to a different type.
Definition iterator.h:237
A range adaptor for a pair of iterators.
Definition SmallVector.h:37
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cummulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
\Returns a value vector with the operands across all lanes for the opearnd at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not schedule.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
CallInst * Call
Changed
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
bool empty() const
Definition BasicBlock.h:100
Instruction & front() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:853
void stable_sort(R &&Range)
Definition STLExtras.h:2021
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1743
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1716
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1723
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
InstructionCost Cost
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2432
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1662
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:656
auto cast_or_null(const Y &Val)
Definition Casting.h:720
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:555
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:342
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:394
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:759
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition STLExtras.h:2091
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:1936
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:340
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition STLExtras.h:419
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1648
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
Definition STLExtras.h:1755
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition SPIRVUtils.h:197
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:403
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1922
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:1998
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1825
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1929
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1750
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1887
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:593
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:836
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:471
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
Add the VectorizableTree to the index iterator to be able to return TreeEntry pointers.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition STLExtras.h:1451
Function object to check whether the second component of a container supported by std::get (like std:...
Definition STLExtras.h:1460
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const